<?php declare(strict_types=1);
namespace Newland\Toubiz\Api\Service;

use Newland\Toubiz\Api\Exception\StringCleaningException;
use function Safe\preg_replace;

/**
 * Collection of helpers that normalise strings: repairing malformed UTF-8, collapsing
 * white space, decoding entity-encoded non-breaking spaces and purifying HTML.
 */
class StringCleaner
{
    /**
     * Raw bytes stripped from both ends of a line when the line is NOT valid UTF-8.
     * "\xA0" is the Latin-1 non-breaking space. Valid UTF-8 lines are trimmed by code point
     * instead (see trimWhiteSpace()), because stripping these bytes individually would cut
     * multi-byte characters in half.
     */
    private $whiteSpaceCharacters = [ ' ', "\t", "\n", "\r", "\0", "\x0B", "\xA0" ];

    /**
     * Textual encodings of the non-breaking space that are replaced by a plain space
     * (matched case-insensitively). `&#a0` is not a valid character reference; it is only
     * kept so that input which used to be rewritten is still rewritten.
     */
    private $encodedWhiteSpace = [ '&nbsp;', '&#160;', '&#xa0;', '&#a0' ];

    public function __construct()
    {
        // Both bytes of the UTF-8 encoded non-breaking space (0xC2 0xA0), used by the
        // byte-wise fallback trim only.
        $this->whiteSpaceCharacters[] = chr(160);
        $this->whiteSpaceCharacters[] = chr(194);
    }

    /**
     * Turns an arbitrary byte string into a single-line, valid UTF-8 string.
     *
     * Malformed UTF-8 bytes, control characters and unused code points (`\p{C}`) are replaced
     * by a space; afterwards every run of white space is collapsed into one space.
     * The result is not trimmed.
     *
     * @param string $string
     * @return string
     * @throws StringCleaningException If one of the replacements fails.
     */
    public static function asString(string $string): string
    {
        try {
            // Ensure that all existent code points are valid.
            $cleaned = static::replaceInvalidUtf8CodePoints($string, ' ');

            // Replace control characters and unused code points (requires valid UTF-8)
            $cleaned = self::regexReplace('/\p{C}/u', ' ', $cleaned);

            // Replace various kinds of whitespace with a single space
            return self::regexReplace('/\s+/u', ' ', $cleaned);
        } catch (\Throwable $e) {
            // Report the untouched input, not a half-processed intermediate value.
            throw StringCleaningException::create($string, $e);
        }
    }

    /**
     * Replaces octets in the given string that do not form valid UTF-8 characters.
     *
     * A code point in UTF-8 describes a single character. UTF-8 uses a pagination approach in order to
     * let often used characters use less space while still being able to accommodate all code points.
     * This way a single code point consists of 1 to 4 octets. The most significant bits of an octet
     * signal which role the octet plays:
     *
     * - `0xxxxxxx` (\x00 - \x7F): standalone code point.
     * - `110xxxxx` (\xC0 - \xDF): start of a sequence with 1 following octet.
     * - `1110xxxx` (\xE0 - \xEF): start of a sequence with 2 following octets.
     * - `11110xxx` (\xF0 - \xF7): start of a sequence with 3 following octets.
     * - `10xxxxxx` (\x80 - \xBF): following octet of a sequence.
     *
     * See the table on https://en.wikipedia.org/wiki/UTF-8 for an illustration.
     *
     * Not every combination matching the bit patterns above is well-formed: \xC0, \xC1 and
     * \xF5 - \xFF can never occur, and some start octets restrict the range of their first
     * following octet (to rule out overlong encodings, UTF-16 surrogates and values above U+10FFFF).
     * The alternatives below list exactly the well-formed sequences. Every octet >= \x80 that is
     * not part of such a sequence is replaced individually, which includes:
     * - following octets that are not preceded by a suitable start octet,
     * - start octets that are not followed by the required number of following octets,
     * - surplus following octets behind a complete sequence.
     *
     * @param string $string
     * @param string $replaceWith Replacement for every single invalid octet.
     * @return string
     */
    protected static function replaceInvalidUtf8CodePoints(string $string, string $replaceWith): string
    {
        $wellFormedSequences = implode('|', [
            '[\xC2-\xDF][\x80-\xBF]',            // 2 octets
            '\xE0[\xA0-\xBF][\x80-\xBF]',        // 3 octets, excluding overlong encodings
            '[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}', // 3 octets
            '\xED[\x80-\x9F][\x80-\xBF]',        // 3 octets, excluding surrogates
            '\xF0[\x90-\xBF][\x80-\xBF]{2}',     // 4 octets, excluding overlong encodings
            '[\xF1-\xF3][\x80-\xBF]{3}',         // 4 octets
            '\xF4[\x80-\x8F][\x80-\xBF]{2}',     // 4 octets, up to U+10FFFF
        ]);

        // A well-formed sequence is consumed and then discarded via (*SKIP)(*FAIL), so the
        // second alternative only ever sees high octets outside of well-formed sequences.
        // The pattern deliberately has no `u` modifier: it must work on malformed input.
        $regex = '/(?:' . $wellFormedSequences . ')(*SKIP)(*FAIL)|[\x80-\xFF]/';

        return self::regexReplace($regex, $replaceWith, $string);
    }

    /**
     * Runs a regular expression replacement on a single string and always returns a string.
     *
     * @param string $regex
     * @param string $replace
     * @param string $subject
     * @return string
     * @throws \RuntimeException If the replacement did not yield a string.
     */
    private static function regexReplace(string $regex, string $replace, string $subject): string
    {
        $result = preg_replace($regex, $replace, $subject);

        if (\is_array($result)) {
            $result = implode('', $result);
        }

        if (!\is_string($result)) {
            // Guard for a `preg_replace` that signals failure through its return value
            // instead of throwing.
            throw new \RuntimeException(
                sprintf('Could not apply regular expression %s (PCRE error code %d).', $regex, \preg_last_error())
            );
        }

        return $result;
    }

    /**
     * Normalises the white space of an HTML fragment.
     *
     * Fragments without any text content (only tags and white space) yield an empty string.
     *
     * @param string $input
     * @return string
     * @throws StringCleaningException
     */
    public function cleanHtmlString(string $input): string
    {
        try {
            if (trim(strip_tags($input)) === '') {
                return '';
            }

            return $this->cleanWhiteSpace($this->replaceEncodedWhiteSpaceWihActualSpaces($input));
        } catch (\Throwable $e) {
            throw StringCleaningException::create($input, $e);
        }
    }

    /**
     * Runs the given HTML through HTMLPurifier using the rules from getFilteringConfig().
     *
     * @param string $value Input string to clean.
     * @return string           Filtered according to the chosen configuration rules.
     * @throws StringCleaningException If configuring or running the purifier fails.
     */
    public function purifyHtml(string $value): string
    {
        try {
            $purifierConfig = \HTMLPurifier_Config::createDefault();

            foreach ($this->getFilteringConfig() as $key => $val) {
                // List-like options are handed over as comma separated strings.
                $purifierConfig->set($key, \is_array($val) ? implode(',', $val) : $val);
            }

            // NOTE(review): the trailing `*` is presumably HTMLPurifier's marker for a required
            // attribute (anchors without `href` get dropped) - confirm against the HTMLPurifier docs.
            $def = $purifierConfig->getHTMLDefinition(true);
            $def->addAttribute('a', 'href*', 'URI');

            return (new \HTMLPurifier($purifierConfig))->purify($value);
        } catch (\Throwable $e) {
            throw StringCleaningException::create($value, $e);
        }
    }

    /**
     * HTMLPurifier directives used by purifyHtml(), keyed by directive name.
     * Array values are joined with commas before being passed to the purifier.
     *
     * @return array
     */
    protected function getFilteringConfig(): array
    {
        return [
            /*
             * Array of allowed HTML Elements.
             * If an element is not allowed, its contents will be preserved, but the element will be removed.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#HTML.AllowedElements
             */
            'HTML.AllowedElements' => [
                'p',
                'b',
                'a',
                'ul',
                'ol',
                'li',
                'strong',
                'em',
                'u',
                'br',
                'h2',
                'h3',
                'h4',
                'h5',
                'table',
                'tr',
                'td',
                'th',
            ],

            /*
             * Array of allowed HTML Attributes.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#HTML.AllowedAttributes
             */
            'HTML.AllowedAttributes' => [
                'a.href',
                'a.title',
                'a.target',
                'table.cellpadding',
                'table.cellspacing',
                'td.colspan',
                'td.rowspan',
                'td.valign',
            ],

            /*
             * Array of allowed target values.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#Attr.AllowedFrameTargets
             */
            'Attr.AllowedFrameTargets' => [
                '_blank',
                '_top',
                '_self',
                '_parent',
            ],

            /*
             * Array of allowed classes.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#Attr.AllowedClasses
             */
            'Attr.AllowedClasses' => [],

            /*
             * Allowed CSS Properties.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#CSS.AllowedProperties
             */
            'CSS.AllowedProperties' => [],

            /*
             * Removes empty tags that have no semantic meaning.
             * @see http://htmlpurifier.org/live/configdoc/plain.html#AutoFormat.RemoveEmpty
             */
            'AutoFormat.RemoveEmpty' => true,

            /*
             * Automatically links URLs that were not linked before.
             * Will transform 'http://google.com' to '<a href="http://google.com">http://google.com</a>'
             * @see http://htmlpurifier.org/live/configdoc/plain.html#AutoFormat.Linkify
             */
            'AutoFormat.Linkify' => true,
        ];
    }

    /**
     * Replaces every entity-encoded non-breaking space (see $encodedWhiteSpace) by a plain space.
     *
     * @param string $input
     * @return string
     */
    private function replaceEncodedWhiteSpaceWihActualSpaces(string $input): string
    {
        $alternatives = [];
        foreach ($this->encodedWhiteSpace as $encoded) {
            $alternatives[] = preg_quote($encoded, '/');
        }

        return self::regexReplace('/(' . implode('|', $alternatives) . ')/i', ' ', $input);
    }

    /**
     * Collapses all white space of a (possibly multi-line) string into single spaces.
     *
     * Every line is trimmed on its own, then the lines are joined by a single space.
     * NOTE(review): empty lines still take part in the join, so an input that starts or ends
     * with a line break yields a leading or trailing space - confirm whether callers rely on it.
     *
     * @param string $input
     * @return string
     * @throws StringCleaningException E.g. if the trimmed input is not valid UTF-8.
     */
    public function cleanWhiteSpace(string $input): string
    {
        try {
            // The entity patterns never contain a line break, so decoding them once up front
            // is equivalent to decoding every line separately.
            $lines = explode("\n", $this->replaceEncodedWhiteSpaceWihActualSpaces($input));

            foreach ($lines as $index => $line) {
                $lines[$index] = self::regexReplace('/\s+/', ' ', $this->trimWhiteSpace($line));
            }

            return self::regexReplace('/\s+/u', ' ', implode(' ', $lines));
        } catch (\Throwable $e) {
            throw StringCleaningException::create($input, $e);
        }
    }

    /**
     * Strips white space, NUL and non-breaking spaces from both ends of a single line.
     *
     * Valid UTF-8 is trimmed by code point. A byte-wise `trim()` with "\xA0" / "\xC2" in its
     * character list would otherwise remove the last octet of characters such as U+00E0
     * (\xC3\xA0) or the first octet of characters such as U+00AB (\xC2\xAB) and leave
     * malformed UTF-8 behind.
     *
     * @param string $line
     * @return string
     */
    private function trimWhiteSpace(string $line): string
    {
        // An empty pattern with the `u` modifier only matches if the subject is valid UTF-8.
        if (\preg_match('//u', $line) === 1) {
            return self::regexReplace(
                '/\A[ \t\n\r\0\x0B\x{A0}]+|[ \t\n\r\0\x0B\x{A0}]+\z/u',
                '',
                $line
            );
        }

        // Not UTF-8 (e.g. Latin-1): there are no multi-byte characters to protect.
        return trim($line, implode('', $this->whiteSpaceCharacters));
    }
}
