From ec2663812470820dc802628f9cb1b768f4f26fc6 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Mon, 8 Dec 2025 23:18:33 +0100 Subject: Fix saveHTML() scrambling encoding in newer libxml2 (#8296) fix https://github.com/FreshRSS/FreshRSS/pull/8279#issuecomment-3620674818 --- app/Utils/httpUtil.php | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'app') diff --git a/app/Utils/httpUtil.php b/app/Utils/httpUtil.php index e43891f34..f5cd95738 100644 --- a/app/Utils/httpUtil.php +++ b/app/Utils/httpUtil.php @@ -203,8 +203,8 @@ final class FreshRSS_http_Util { } } if ($httpCharsetNormalized === 'UTF-8') { - // Save encoding information as XML declaration - return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html; + // Save encoding information as Unicode BOM + return "\xEF\xBB\xBF" . $html; } // Give up return $html; @@ -241,7 +241,19 @@ final class FreshRSS_http_Util { $doc->documentElement->insertBefore($base, $doc->documentElement->firstChild); } } - return $doc->saveHTML() ?: $html; + + // Save the start of HTML because libxml2 saveHTML() risks scrambling it + $htmlPos = stripos($html, '<html'); + $htmlStart = $htmlPos === false || $htmlPos > 512 ? '' : substr($html, 0, $htmlPos); + + $html = $doc->saveHTML() ?: $html; + if ($htmlStart !== '' && !str_starts_with($html, $htmlStart)) { + // libxml2 saveHTML() risks removing Unicode BOM and XML declaration, + // which affects future detection of charset encoding, so manually restore it + $htmlPos = stripos($html, '<html'); + $html = $htmlPos === false || $htmlPos > 512 ? $html : $htmlStart . substr($html, $htmlPos); + } + return $html; } /** -- cgit v1.2.3