aboutsummaryrefslogtreecommitdiff
path: root/lib/lib_rss.php
diff options
context:
space:
mode:
authorGravatar Alexandre Alapetite <alexandre@alapetite.fr> 2024-02-18 10:53:44 +0100
committerGravatar GitHub <noreply@github.com> 2024-02-18 10:53:44 +0100
commit7d6a64a52243838e37ed47289b73574cfcd3b356 (patch)
tree36e9b5d14bdd0fff055bf76713bae5f40d8c3c0b /lib/lib_rss.php
parent53d40ea3bba2becb5ed19a09ca9e87dbde2d46e0 (diff)
Web scraping support encodings such as EUC-JP (#6112)
* Web scraping support encodings such as EUC-JP
fix https://github.com/FreshRSS/FreshRSS/issues/6106
* Typo
Diffstat (limited to 'lib/lib_rss.php')
-rw-r--r--lib/lib_rss.php28
1 files changed, 25 insertions, 3 deletions
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index 9dfb26405..e01630316 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -369,8 +369,17 @@ function cleanCache(int $hours = 720): void {
}
/**
+ * Remove the charset meta information of an HTML document, e.g.:
+ * `<meta charset="..." />`
+ * `<meta http-equiv="Content-Type" content="text/html; charset=...">`
+ * Only the first matching `<meta>` tag is removed; the match is case-insensitive.
+ * @param string $html the HTML content to process
+ * @return string the HTML without its first charset `<meta>` tag,
+ *  or the unchanged input if the regular expression could not be evaluated
+ */
+function stripHtmlMetaCharset(string $html): string {
+	// preg_replace() returns null on a PCRE failure (e.g. backtrack limit reached):
+	// keep the document as-is in that case instead of discarding it entirely
+	return preg_replace('/<meta\s[^>]*charset\s*=\s*[^>]+>/i', '', $html, 1) ?? $html;
+}
+
+/**
* Set an XML preamble to enforce the HTML content type charset received by HTTP.
- * @param string $html the row downloaded HTML content
+ * @param string $html the raw downloaded HTML content
* @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
* @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
*/
@@ -381,7 +390,7 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
return $html;
}
$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
- if ($httpCharsetNormalized === 'windows-1252') {
+ if (in_array($httpCharsetNormalized, ['windows-1252', 'US-ASCII'], true)) {
// Default charset for HTTP, do nothing
return $html;
}
@@ -397,7 +406,20 @@ function enforceHttpEncoding(string $html, string $contentType = ''): string {
// Existing XML declaration, do nothing
return $html;
}
- return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+ if ($httpCharsetNormalized !== 'UTF-8') {
+ // Try to change encoding to UTF-8 using mbstring or iconv or intl
+ $utf8 = SimplePie_Misc::change_encoding($html, $httpCharsetNormalized, 'UTF-8');
+ if (is_string($utf8)) {
+ $html = stripHtmlMetaCharset($utf8);
+ $httpCharsetNormalized = 'UTF-8';
+ }
+ }
+ if ($httpCharsetNormalized === 'UTF-8') {
+ // Save encoding information as XML declaration
+ return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+ }
+ // Could not convert to UTF-8: give up and return the HTML unchanged
+ return $html;
}
/**