From 7d6a64a52243838e37ed47289b73574cfcd3b356 Mon Sep 17 00:00:00 2001
From: Alexandre Alapetite
Date: Sun, 18 Feb 2024 10:53:44 +0100
Subject: Web scraping support encodings such as EUC-JP (#6112)

* Web scraping support encodings such as EUC-JP
fix https://github.com/FreshRSS/FreshRSS/issues/6106

* Typo
---
 lib/lib_rss.php | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'lib')

diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 9dfb26405..e01630316 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -368,9 +368,18 @@ function cleanCache(int $hours = 720): void {
 	}
 }
 
+/**
+ * Remove the charset meta information of an HTML document, e.g.:
+ * `<meta charset="..." />`
+ * `<meta http-equiv="Content-Type" content="text/html; charset=...">`
+ */
+function stripHtmlMetaCharset(string $html): string {
+	return preg_replace('/<meta\s[^>]*charset\s*=\s*[^>]+>/i', '', $html, 1) ?? '';
+}
+
 /**
  * Set an XML preamble to enforce the HTML content type charset received by HTTP.
- * @param string $html the row downloaded HTML content
+ * @param string $html the raw downloaded HTML content
  * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
  * @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
  */
@@ -381,7 +390,7 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
 		return $html;
 	}
 	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
-	if ($httpCharsetNormalized === 'windows-1252') {
+	if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
 		// Default charset for HTTP, do nothing
 		return $html;
 	}
@@ -397,7 +406,20 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
 		// Existing XML declaration, do nothing
 		return $html;
 	}
-	return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+	if ($httpCharsetNormalized !== 'UTF-8') {
+		// Try to change encoding to UTF-8 using mbstring or iconv or intl
+		$utf8 = SimplePie_Misc::change_encoding($html, $httpCharsetNormalized, 'UTF-8');
+		if (is_string($utf8)) {
+			$html = stripHtmlMetaCharset($utf8);
+			$httpCharsetNormalized = 'UTF-8';
+		}
+	}
+	if ($httpCharsetNormalized === 'UTF-8') {
+		// Save encoding information as XML declaration
+		return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+	}
+	// Give up
+	return $html;
 }
 
 /**
-- 
cgit v1.2.3