aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorGravatar Alexandre Alapetite <alexandre@alapetite.fr> 2022-02-28 20:22:43 +0100
committerGravatar GitHub <noreply@github.com> 2022-02-28 20:22:43 +0100
commit1fe66ad020ca8f0560bb9c6e311852ed77228f78 (patch)
treedf78da3f33a9f13a9d6ba3f2744c369bd6e313a6 /lib
parentfa23ae76ea46b329fb65329081df95e864b03b23 (diff)
Implement Web scraping "HTML + XPath" (#4220)
* More PHP type hints for Fever Follow-up of https://github.com/FreshRSS/FreshRSS/pull/4201 Related to https://github.com/FreshRSS/FreshRSS/issues/4200 * Detail * Draft * Progress * More draft * Fix thumbnail PHP type hint https://github.com/FreshRSS/FreshRSS/issues/4215 * More types * A bit more * Refactor FreshRSS_Entry::fromArray * Progress * Starts to work * Categories * Fonctional * Layout update * Fix relative URLs * Cache system * Forgotten files * Remove a debug line * Automatic form validation of XPath expressions * data-leave-validation * Fix reload action * Simpler examples * Fix column type for PostgreSQL * Enforce HTTP encoding * Readme * Fix get full content * target="_blank" * gitignore * htmlspecialchars_utf8 * Implement HTML <base> And fix/revert `xml:base` support in SimplePie https://github.com/simplepie/simplepie/commit/e49c578817aa504d8d05cd7f33857aeda9d41908 * SimplePie upstream PR merged https://github.com/simplepie/simplepie/pull/723
Diffstat (limited to 'lib')
-rw-r--r--lib/Minz/Url.php7
-rw-r--r--lib/Minz/View.php6
-rw-r--r--lib/SimplePie/SimplePie.php2
-rw-r--r--lib/lib_phpQuery.php3
-rw-r--r--lib/lib_rss.php127
5 files changed, 139 insertions, 6 deletions
diff --git a/lib/Minz/Url.php b/lib/Minz/Url.php
index be3184b40..40cadb49a 100644
--- a/lib/Minz/Url.php
+++ b/lib/Minz/Url.php
@@ -121,7 +121,8 @@ class Minz_Url {
/**
* @param string $controller
* @param string $action
- * @param string ...$args
+ * @param string|int ...$args
+ * @return string|false
*/
function _url ($controller, $action, ...$args) {
$nb_args = count($args);
@@ -132,8 +133,8 @@ function _url ($controller, $action, ...$args) {
$params = array ();
for ($i = 0; $i < $nb_args; $i += 2) {
- $arg = $args[$i];
- $params[$arg] = $args[$i + 1];
+ $arg = '' . $args[$i];
+ $params[$arg] = '' . $args[$i + 1];
}
return Minz_Url::display (array ('c' => $controller, 'a' => $action, 'params' => $params));
diff --git a/lib/Minz/View.php b/lib/Minz/View.php
index 431a8b700..6cf811bff 100644
--- a/lib/Minz/View.php
+++ b/lib/Minz/View.php
@@ -112,6 +112,12 @@ class Minz_View {
}
}
+	/**
+	 * Run render() with output buffering enabled and return what it printed, instead of sending it.
+	 * @return string the captured output, or an empty string if the buffer could not be retrieved
+	 */
+	public function renderToString(): string {
+		ob_start();
+		$this->render();
+		// ob_get_clean() returns false when no output buffer is active (e.g. if it was closed during render())
+		$output = ob_get_clean();
+		return $output === false ? '' : $output;
+	}
+
/**
* Ajoute un élément du layout
* @param string $part l'élément partial à ajouter
diff --git a/lib/SimplePie/SimplePie.php b/lib/SimplePie/SimplePie.php
index b0e973e83..bf4a66bb4 100644
--- a/lib/SimplePie/SimplePie.php
+++ b/lib/SimplePie/SimplePie.php
@@ -2275,7 +2275,7 @@ class SimplePie
*/
public function get_base($element = array())
{
- if (!($this->get_type() & SIMPLEPIE_TYPE_RSS_SYNDICATION) && !empty($element['xml_base_explicit']) && isset($element['xml_base']))
+ if (!empty($element['xml_base_explicit']) && isset($element['xml_base']))
{
return $element['xml_base'];
}
diff --git a/lib/lib_phpQuery.php b/lib/lib_phpQuery.php
index 411aa120c..1fabfcb6d 100644
--- a/lib/lib_phpQuery.php
+++ b/lib/lib_phpQuery.php
@@ -436,7 +436,8 @@ class DOMDocumentWrapper {
}
/**
 * Tell whether the given markup should be handled as XML, looking only at its first 100 bytes.
 * The new version answers true when that head contains an XML declaration opener
 * and does not contain an opening html tag followed by a space (case-insensitive search).
 * NOTE(review): the needle is written as two concatenated pieces, presumably to keep the
 * literal sequence out of the PHP source; confirm before simplifying.
 * @param string $markup markup to inspect (it is passed to substr(), so used as a string)
 * @return bool
 */
protected function isXML($markup) {
// return strpos($markup, '<?xml') !== false && stripos($markup, 'xhtml') === false;
- return strpos(substr($markup, 0, 100), '<'.'?xml') !== false;
+ $head = substr($markup, 0, 100);
+ return strpos($head, '<'.'?xml') !== false && stripos($head, '<html ') === false;
}
protected function contentTypeToArray($contentType) {
$matches = explode(';', trim(strtolower($contentType)));
diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index e020236ea..4e415d857 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -218,6 +218,7 @@ function customSimplePie($attributes = array()): SimplePie {
$simplePie->set_cache_name_function('sha1');
$simplePie->set_cache_location(CACHE_PATH);
$simplePie->set_cache_duration($limits['cache_duration']);
+ $simplePie->enable_order_by_date(false);
$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
$simplePie->set_timeout($feed_timeout > 0 ? $feed_timeout : $limits['timeout']);
@@ -290,7 +291,10 @@ function customSimplePie($attributes = array()): SimplePie {
return $simplePie;
}
-function sanitizeHTML($data, $base = '', $maxLength = false) {
+/**
+ * @param int|false $maxLength
+ */
+function sanitizeHTML($data, string $base = '', $maxLength = false) {
if (!is_string($data) || ($maxLength !== false && $maxLength <= 0)) {
return '';
}
@@ -311,6 +315,127 @@ function sanitizeHTML($data, $base = '', $maxLength = false) {
return $result;
}
+/**
+ * Delete the `.html` and `.spc` files located directly in CACHE_PATH whose modification time is too old.
+ * Files whose name ends with `index.html` are always kept.
+ * @param int $hours age in hours (default 720, i.e. 30 days) beyond which a cache file is removed
+ */
+function cleanCache(int $hours = 720) {
+	// N.B.: GLOB_BRACE is not available on all platforms (e.g. musl-based systems such as Alpine Linux),
+	// so run one glob() per extension. glob() may return false on error, hence the empty array fallback.
+	$files = array_merge(
+		glob(CACHE_PATH . '/*.html', GLOB_NOSORT) ?: [],
+		glob(CACHE_PATH . '/*.spc', GLOB_NOSORT) ?: []
+	);
+	$expiry = time() - (3600 * $hours);
+	foreach ($files as $file) {
+		if (substr($file, -10) === 'index.html') {
+			continue;
+		}
+		// The file may disappear concurrently: filemtime() then fails and the file is simply skipped
+		$cacheMtime = @filemtime($file);
+		if ($cacheMtime !== false && $cacheMtime < $expiry) {
+			unlink($file);
+		}
+	}
+}
+
+/**
+ * Set an XML preamble to enforce the HTML content type charset received by HTTP.
+ * The HTML is returned unchanged when HTTP gives no charset, when the normalised charset is
+ * windows-1252, when the content starts with a byte order mark,
+ * or when it already starts with an XML declaration containing an encoding.
+ * @param string $html the raw downloaded HTML content
+ * @param string $contentType an HTTP Content-Type such as 'text/html; charset=utf-8'
+ * @return string an HTML string with XML encoding information for DOMDocument::loadHTML()
+ */
+function enforceHttpEncoding(string $html, string $contentType = ''): string {
+	// The charset value may be a token or a quoted-string, and may be followed by other parameters.
+	// preg_match() returns 1 only on a successful match (0 = no match, false = error).
+	$httpCharset = preg_match('/\bcharset="?([0-9a-z_-]{2,12})"?\s*(?:;|$)/i', $contentType, $matches) === 1 ? $matches[1] : '';
+	if ($httpCharset == '') {
+		// No charset defined by HTTP, do nothing
+		return $html;
+	}
+	$httpCharsetNormalized = SimplePie_Misc::encoding($httpCharset);
+	if ($httpCharsetNormalized === 'windows-1252') {
+		// Default charset for HTTP, do nothing
+		return $html;
+	}
+	if (substr($html, 0, 3) === "\xEF\xBB\xBF" || // UTF-8 BOM
+		substr($html, 0, 2) === "\xFF\xFE" || // UTF-16 Little Endian BOM
+		substr($html, 0, 2) === "\xFE\xFF" || // UTF-16 Big Endian BOM
+		substr($html, 0, 4) === "\xFF\xFE\x00\x00" || // UTF-32 Little Endian BOM
+		substr($html, 0, 4) === "\x00\x00\xFE\xFF") { // UTF-32 Big Endian BOM
+		// Existing byte order mark, do nothing
+		return $html;
+	}
+	if (preg_match('/^<[?]xml[^>]+encoding\b/', substr($html, 0, 64))) {
+		// Existing XML declaration, do nothing
+		return $html;
+	}
+	// The closing sequence of the declaration is split in two strings on purpose, as in the rest of the expression
+	return '<' . '?xml version="1.0" encoding="' . $httpCharsetNormalized . '" ?' . ">\n" . $html;
+}
+
+/**
+ * Fetch the HTML document at the given URL with cURL, going through a file cache.
+ * A cached copy is returned when it is non-empty and younger than the configured `cache_duration`;
+ * otherwise the document is downloaded, passed through enforceHttpEncoding(), and written to the cache.
+ * Download problems are logged as warnings, not thrown.
+ * @param string $url address of the document to fetch
+ * @param array<string,mixed> $attributes optional settings; the keys read here are
+ *	'timeout', 'curl_params' and 'ssl_verify' (the whole array is also used to compute the cache file name)
+ * @return string the HTML content, or an empty string when cURL returned no content
+ */
+function getHtml(string $url, array $attributes = []): string {
+	$limits = FreshRSS_Context::$system_conf->limits;
+	$feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
+	// A positive per-feed timeout takes precedence over the system-wide one
+	$timeout = $feed_timeout > 0 ? $feed_timeout : $limits['timeout'];
+
+	$cachePath = FreshRSS_Feed::cacheFilename($url, $attributes, FreshRSS_Feed::KIND_HTML_XPATH);
+	$cacheMtime = @filemtime($cachePath);
+	if ($cacheMtime !== false && $cacheMtime > time() - intval($limits['cache_duration'])) {
+		$html = @file_get_contents($cachePath);
+		if ($html != '') {
+			syslog(LOG_DEBUG, 'FreshRSS uses cache for ' . SimplePie_Misc::url_remove_credentials($url));
+			return $html;
+		}
+	}
+
+	if (mt_rand(0, 30) === 1) { // Remove old entries once in a while
+		cleanCache();
+	}
+
+	if (FreshRSS_Context::$system_conf->simplepie_syslog_enabled) {
+		syslog(LOG_INFO, 'FreshRSS GET ' . SimplePie_Misc::url_remove_credentials($url));
+	}
+
+	// TODO: Implement HTTP 1.1 conditional GET If-Modified-Since
+	$ch = curl_init();
+	curl_setopt_array($ch, [
+		CURLOPT_URL => $url,
+		CURLOPT_REFERER => SimplePie_Misc::url_remove_credentials($url),
+		CURLOPT_HTTPHEADER => array('Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
+		CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
+		CURLOPT_CONNECTTIMEOUT => $timeout,
+		CURLOPT_TIMEOUT => $timeout,
+		//CURLOPT_FAILONERROR => true;
+		CURLOPT_MAXREDIRS => 4,
+		CURLOPT_RETURNTRANSFER => true,
+		CURLOPT_FOLLOWLOCATION => true,
+		CURLOPT_ENCODING => '', //Enable all encodings
+	]);
+
+	// Later option sets override earlier ones: defaults, then system configuration, then per-feed parameters
+	curl_setopt_array($ch, FreshRSS_Context::$system_conf->curl_options);
+
+	if (isset($attributes['curl_params']) && is_array($attributes['curl_params'])) {
+		curl_setopt_array($ch, $attributes['curl_params']);
+	}
+
+	if (isset($attributes['ssl_verify'])) {
+		curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $attributes['ssl_verify'] ? 2 : 0);
+		curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $attributes['ssl_verify'] ? true : false);
+		if (!$attributes['ssl_verify']) {
+			curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
+		}
+	}
+	$html = curl_exec($ch);
+	$c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
+	$c_content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
+	$c_error = curl_error($ch);
+	curl_close($ch);
+
+	if (!is_string($c_content_type)) {
+		// cURL gives null when the server did not send a valid Content-Type header,
+		// which would be rejected by the string parameter of enforceHttpEncoding()
+		$c_content_type = '';
+	}
+
+	if ($c_status != 200 || $c_error != '' || $html === false) {
+		Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
+	}
+	if ($html == false) {
+		// Loose comparison on purpose: covers a failed transfer (false) as well as an empty body
+		$html = '';
+	} else {
+		$html = enforceHttpEncoding($html, $c_content_type);
+	}
+
+	if (file_put_contents($cachePath, $html) === false) {
+		Minz_Log::warning("Error saving cache $cachePath for $url");
+	}
+
+	return $html;
+}
+
/**
* Validate an email address, supports internationalized addresses.
*