aboutsummaryrefslogtreecommitdiff
path: root/lib/lib_rss.php
diff options
context:
space:
mode:
Diffstat (limited to 'lib/lib_rss.php')
-rw-r--r-- lib/lib_rss.php 17
1 file changed, 13 insertions, 4 deletions
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 6669b36dc..4ce9e7bfd 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -444,8 +444,14 @@ function stripHtmlMetaCharset(string $html): string {
function enforceHttpEncoding(string $html, string $contentType = ''): string {
$httpCharset = preg_match('/\bcharset=([0-9a-z_-]{2,12})$/i', $contentType, $matches) === 1 ? $matches[1] : '';
if ($httpCharset == '') {
- // No charset defined by HTTP, do nothing
- return $html;
+ // No charset defined by HTTP
+ if (preg_match('/<meta\s[^>]*charset\s*=[\s\'"]*UTF-?8\b/i', substr($html, 0, 2048))) {
+ // Detect UTF-8 even if declared too deep in HTML for DOMDocument
+ $httpCharset = 'UTF-8';
+ } else {
+ // Do nothing
+ return $html;
+ }
}
$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
@@ -565,8 +571,11 @@ function httpGet(string $url, string $cachePath, string $type = 'html', array $a
// TODO: Implement HTTP 410 Gone
} elseif (!is_string($body) || strlen($body) === 0) {
$body = '';
- } elseif ($type !== 'json') {
- $body = enforceHttpEncoding($body, $c_content_type);
+ } else {
+ $body = trim($body, " \n\r\t\v"); // Do not trim \x00 to avoid breaking a BOM
+ if ($type !== 'json') {
+ $body = enforceHttpEncoding($body, $c_content_type);
+ }
}
if (file_put_contents($cachePath, $body) === false) {