<?php declare(strict_types=1);
namespace Newland\Toubiz\Api\Service;

use function Safe\preg_replace;

class StringCleaner
{
    /**
     * Byte sequences that are stripped from both ends of a string by cleanWhiteSpace().
     *
     * The non-breaking space is listed as its complete UTF-8 sequence (0xC2 0xA0): trimming
     * the single byte 0xA0 would cut valid multi-byte characters such as "à" (0xC3 0xA0) in half.
     *
     * @var string[]
     */
    private $whiteSpaceCharacters = [ ' ', "\t", "\n", "\r", "\0", "\x0B", "\xC2\xA0" ];

    /**
     * HTML entities that are replaced by a plain space (matched case-insensitively).
     * '&#a0' is not a valid entity; it is only kept so previously handled input behaves the same.
     *
     * @var string[]
     */
    private $encodedWhiteSpace = [ '&nbsp;', '&#160;', '&#xA0;', '&#a0' ];

    /**
     * Cleans an HTML fragment.
     *
     * Returns an empty string if the fragment contains no text besides tags and whitespace.
     * Otherwise encoded non-breaking spaces are replaced by regular spaces, surrounding
     * whitespace is trimmed and all line feeds are removed.
     *
     * @param string $input
     * @return string
     */
    public function cleanHtmlString(string $input): string
    {
        $isEmpty = trim(strip_tags($input)) === '';
        $output = $isEmpty ? '' : $input;
        $output = $this->replaceEncodedWhiteSpaceWithActualSpaces($output);

        return $this->cleanWhiteSpace($output);
    }

    /**
     * Replaces every entity listed in $encodedWhiteSpace with a single regular space.
     *
     * @param string $input
     * @return string
     */
    private function replaceEncodedWhiteSpaceWithActualSpaces(string $input): string
    {
        $parts = array_map(
            static function (string $part): string {
                return preg_quote($part, '/');
            },
            $this->encodedWhiteSpace
        );
        $regex = '/(' . implode('|', $parts) . ')/i';

        return self::regexReplace($regex, ' ', $input);
    }

    /**
     * Trims whitespace (including non-breaking spaces) from both ends of the string
     * and removes all line feeds.
     *
     * The trimming works on bytes but only ever removes complete sequences, so valid
     * multi-byte UTF-8 characters at the edges of the string stay intact.
     *
     * @param string $input
     * @return string
     */
    private function cleanWhiteSpace(string $input): string
    {
        $alternatives = array_map(
            static function (string $character): string {
                return preg_quote($character, '/');
            },
            $this->whiteSpaceCharacters
        );

        // A lone 0xA0 byte (non-breaking space in single-byte encodings) is still trimmed,
        // but only if it does not follow another high byte. In valid UTF-8 0xA0 always
        // follows a high byte, so characters like "à" are never touched.
        $alternatives[] = '(?<![\x80-\xFF])\xA0';

        // Possessive quantifiers avoid needless backtracking; `D` makes `$` match only at the very end.
        $whiteSpace = '(?:' . implode('|', $alternatives) . ')++';
        $input = self::regexReplace('/^' . $whiteSpace . '|' . $whiteSpace . '$/D', '', $input);

        return str_replace("\n", '', $input);
    }

    /**
     * Normalizes a string: invalid UTF-8 byte sequences are removed first, then every run of
     * whitespace and characters of the unicode category "Other" (\pC: control, format,
     * unassigned, ...) is collapsed into a single space.
     *
     * @param string $string
     * @return string
     */
    public static function asString(string $string): string
    {
        $string = self::removeInvalidUtf8Characters($string);

        return self::regexReplace('/[\pC\s]+/u', ' ', $string);
    }

    /**
     * Replaces byte sequences that are not valid UTF-8 with spaces.
     *
     * XML responses sometimes contained invalid UTF-8: it looked like part of a multi-byte
     * sequence was missing from the response or got lost between service transactions.
     * The clearest example was a lone \xC2 without its continuation byte, which is
     * invalid UTF-8 but is not detected by a regular preg_replace.
     *
     * After trying multiple solutions (mb_* encoding functions, request headers, etc.)
     * the following articles were found, which present the expressions used below:
     *
     * https://magp.ie/2011/01/06/remove-non-utf8-characters-from-string-with-php/
     *
     * https://webcollab.sourceforge.io/unicode.html
     *
     * @param string $string
     * @return string
     */
    private static function removeInvalidUtf8Characters(string $string): string
    {
        // Continuation bytes (0x80 - 0xBF) are only valid directly after a lead byte or after
        // another continuation byte. One that follows an ASCII byte or starts the string is stray.
        // Lead bytes are deliberately not matched here: a dangling lead byte (e.g. a lone 0xC2)
        // is handled by the next expression, while a valid one must survive.
        $string = self::regexReplace('/(?<![\x80-\xFF])[\x80-\xBF]/', ' ', $string);

        // Reject overly long 2 byte sequences, as well as characters above U+10000 and replace with a space
        $string = self::regexReplace(
            // Control characters.
            // NOTE(review): \x10 is already covered by \x0E-\x19 and the range stops at \x19
            // instead of \x1F - kept exactly as published in the linked article.
            '/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]' .

            // Any 2 byte lead byte that is not followed by exactly one continuation byte.
            '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})' .

            // Continuation bytes directly following an ASCII byte.
            '|[\x00-\x7F][\x80-\xBF]+' .

            // Lead bytes of overlong 2 byte sequences and of 4+ byte sequences, including their continuation bytes.
            '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*' .

            // Any 3 byte lead byte that is not followed by exactly two continuation bytes.
            '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S',
            ' ',
            $string
        );

        // Reject overly long 3 byte sequences and UTF-16 surrogates and replace with a space.
        $string = self::regexReplace(
            '/\xE0[\x80-\x9F][\x80-\xBF]' .
            '|\xED[\xA0-\xBF][\x80-\xBF]/S',
            ' ',
            $string
        );

        return $string;
    }

    /**
     * Thin wrapper around preg_replace that always returns a string.
     *
     * @param string $regex
     * @param string $replace
     * @param string $subject
     * @return string
     */
    private static function regexReplace(string $regex, string $replace, string $subject): string
    {
        $result = preg_replace($regex, $replace, $subject);

        // preg_replace is declared as returning string|array; normalize to a string.
        if (\is_array($result)) {
            $result = implode('', $result);
        }

        return $result;
    }
}
