From b906d79d61beaeed7474f33fb6dac01a63ca90b9 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Thu, 21 May 2020 11:56:55 +0200 Subject: getContentByParsing follow HTML redirections (#2985) * getContentByParsing follow HTML redirections Add the ability to follow HTML redirections in getContentByParsing: ```html <meta http-equiv="Refresh" content="1; url=https://example.net/article123.html" /> ``` * Better regex * Trim http-equiv --- app/Models/Entry.php | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'app/Models/Entry.php') diff --git a/app/Models/Entry.php b/app/Models/Entry.php index 2ce44c73d..b5328df90 100644 --- a/app/Models/Entry.php +++ b/app/Models/Entry.php @@ -352,7 +352,7 @@ class FreshRSS_Entry extends Minz_Model { } } - public static function getContentByParsing($url, $path, $attributes = array()) { + public static function getContentByParsing($url, $path, $attributes = array(), $maxRedirs = 3) { $system_conf = Minz_Configuration::get('system'); $limits = $system_conf->limits; $feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']); @@ -392,6 +392,21 @@ class FreshRSS_Entry extends Minz_Model { if ($html) { require_once(LIB_PATH . '/lib_phpQuery.php'); $doc = phpQuery::newDocument($html); + + if ($maxRedirs > 0) { + //Follow any HTML redirection + $metas = $doc->find('meta[http-equiv][content]'); + foreach ($metas as $meta) { + if (strtolower(trim($meta->getAttribute('http-equiv'))) === 'refresh') { + $refresh = preg_replace('/^[0-9.; ]*\s*(url\s*=)?\s*/i', '', trim($meta->getAttribute('content'))); + $refresh = SimplePie_Misc::absolutize_url($refresh, $url); + if ($refresh != false && $refresh !== $url) { + return self::getContentByParsing($refresh, $path, $attributes, $maxRedirs - 1); + } + } + } + } + $content = $doc->find($path); return trim(sanitizeHTML($content->__toString(), $url)); } else { -- cgit v1.2.3