aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alexandre Alapetite <alexandre@alapetite.fr> 2024-04-06 23:02:50 +0200
committer GitHub <noreply@github.com> 2024-04-06 23:02:50 +0200
commit e3c86a164d9903a99f10affae095f350e4075287 (patch)
tree 454ccc91fc8627a3072cdcaa57e93efc330f3de1
parent 1c684a91d2014610f971212a200c311214138d6d (diff)
HTTP Get allow UTF-8 even when charset is far from top (#6271)
* HTTP Get allow UTF-8 even when charset is far from top
  Fix https://github.com/FreshRSS/FreshRSS/issues/5586
  The case was an HTML document with 15k of whitespace, then 1.2k of scripts, before the `<meta charset="utf-8">` (far from the 1024 bytes suggested by the spec, and too far for DOMDocument).
* Rewording
* Trim also vertical tab + comment
-rw-r--r-- lib/lib_rss.php 17
1 files changed, 13 insertions, 4 deletions
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 6669b36dc..4ce9e7bfd 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -444,8 +444,14 @@ function stripHtmlMetaCharset(string $html): string {
function enforceHttpEncoding(string $html, string $contentType = ''): string {
$httpCharset = preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $matches) === 1 ? $matches[1] : '';
if ($httpCharset == '') {
- // No charset defined by HTTP, do nothing
- return $html;
+ // No charset defined by HTTP
+ if (preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048))) {
+ // Detect UTF-8 even if declared too deep in HTML for DOMDocument
+ $httpCharset = 'UTF-8';
+ } else {
+ // Do nothing
+ return $html;
+ }
}
$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
@@ -565,8 +571,11 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
// TODO: Implement HTTP 410 Gone
} elseif (!is_string($body) || strlen($body) === 0) {
$body = '';
- } elseif ($type !== 'json') {
- $body = enforceHttpEncoding($body, $c_content_type);
+ } else {
+ $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
+ if ($type !== 'json') {
+ $body = enforceHttpEncoding($body, $c_content_type);
+ }
}
if (file_put_contents($cachePath, $body) === false) {