diff options
| author | 2024-04-06 23:02:50 +0200 | |
|---|---|---|
| committer | 2024-04-06 23:02:50 +0200 | |
| commit | e3c86a164d9903a99f10affae095f350e4075287 (patch) | |
| tree | 454ccc91fc8627a3072cdcaa57e93efc330f3de1 /lib/lib_rss.php | |
| parent | 1c684a91d2014610f971212a200c311214138d6d (diff) | |
HTTP Get allow UTF-8 even when charset is far from top (#6271)
* HTTP Get allow UTF-8 even when charset is far from top
fix https://github.com/FreshRSS/FreshRSS/issues/5586
The case was an HTML document with 15k of whitespace followed by 1.2k of scripts before the `<meta charset="utf-8">` (far beyond the first 1024 bytes suggested by the spec, and too deep for DOMDocument to detect)
* Rewording
* Trim also vertical tab + comment
Diffstat (limited to 'lib/lib_rss.php')
| -rw-r--r-- | lib/lib_rss.php | 17 |
1 file changed, 13 insertions, 4 deletions
```diff
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 6669b36dc..4ce9e7bfd 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -444,8 +444,14 @@ function stripHtmlMetaCharset(string $html): string {
 function enforceHttpEncoding(string $html, string $contentType = ''): string {
 	$httpCharset = preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $matches) === 1 ? $matches[1] : '';
 	if ($httpCharset == '') {
-		// No charset defined by HTTP, do nothing
-		return $html;
+		// No charset defined by HTTP
+		if (preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048))) {
+			// Detect UTF-8 even if declared too deep in HTML for DOMDocument
+			$httpCharset = 'UTF-8';
+		} else {
+			// Do nothing
+			return $html;
+		}
 	}
 	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
 	if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
@@ -565,8 +571,11 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
 		// TODO: Implement HTTP 410 Gone
 	} elseif (!is_string($body) || strlen($body) === 0) {
 		$body = '';
-	} elseif ($type !== 'json') {
-		$body = enforceHttpEncoding($body, $c_content_type);
+	} else {
+		$body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
+		if ($type !== 'json') {
+			$body = enforceHttpEncoding($body, $c_content_type);
+		}
 	}
 
 	if (file_put_contents($cachePath, $body) === false) {
```
