aboutsummaryrefslogtreecommitdiff
path: root/app/Models/Entry.php
diff options
context:
space:
mode:
author Alexandre Alapetite <alexandre@alapetite.fr> 2022-02-28 20:22:43 +0100
committer GitHub <noreply@github.com> 2022-02-28 20:22:43 +0100
commit 1fe66ad020ca8f0560bb9c6e311852ed77228f78 (patch)
tree df78da3f33a9f13a9d6ba3f2744c369bd6e313a6 /app/Models/Entry.php
parent fa23ae76ea46b329fb65329081df95e864b03b23 (diff)
Implement Web scraping "HTML + XPath" (#4220)
* More PHP type hints for Fever
  Follow-up of https://github.com/FreshRSS/FreshRSS/pull/4201
  Related to https://github.com/FreshRSS/FreshRSS/issues/4200
* Detail
* Draft
* Progress
* More draft
* Fix thumbnail PHP type hint
  https://github.com/FreshRSS/FreshRSS/issues/4215
* More types
* A bit more
* Refactor FreshRSS_Entry::fromArray
* Progress
* Starts to work
* Categories
* Functional
* Layout update
* Fix relative URLs
* Cache system
* Forgotten files
* Remove a debug line
* Automatic form validation of XPath expressions
* data-leave-validation
* Fix reload action
* Simpler examples
* Fix column type for PostgreSQL
* Enforce HTTP encoding
* Readme
* Fix get full content
* target="_blank"
* gitignore
* htmlspecialchars_utf8
* Implement HTML <base>
  And fix/revert `xml:base` support in SimplePie https://github.com/simplepie/simplepie/commit/e49c578817aa504d8d05cd7f33857aeda9d41908
* SimplePie upstream PR merged
  https://github.com/simplepie/simplepie/pull/723
Diffstat (limited to 'app/Models/Entry.php')
-rw-r--r-- app/Models/Entry.php 96
1 files changed, 49 insertions, 47 deletions
diff --git a/app/Models/Entry.php b/app/Models/Entry.php
index a190e505d..ab88d777a 100644
--- a/app/Models/Entry.php
+++ b/app/Models/Entry.php
@@ -59,6 +59,38 @@ class FreshRSS_Entry extends Minz_Model {
$this->_guid($guid);
}
+ /**
+ * Builds a FreshRSS_Entry from an associative array of entry fields.
+ *
+ * Keys read: id_feed, guid, title, author, content, link, date, is_read, is_favorite, tags,
+ * plus the optional thumbnail, id, timestamp and categories. Keys that are absent (or null)
+ * fall back to the defaults visible below (0, '' or false).
+ *
+ * NOTE(review): isset() is true for an empty string, so a thumbnail of '' still appends
+ * an <img> with an empty src; the thumbnail value is also concatenated without any escaping
+ * in this method. Confirm that callers pass a non-empty, already HTML-safe URL.
+ *
+ * NOTE(review): strtotime() returns false for text it cannot parse; _date() is not visible
+ * here, so confirm that it copes with a false argument.
+ *
+ * @param array<string,mixed> $dao
+ * @return FreshRSS_Entry the newly created entry
+ */
+ public static function fromArray(array $dao): FreshRSS_Entry {
+ if (!isset($dao['content'])) {
+ $dao['content'] = '';
+ }
+ if (isset($dao['thumbnail'])) {
+ $dao['content'] .= '<p class="enclosure-content"><img src="' . $dao['thumbnail'] . '" alt="" /></p>'; // thumbnail is appended to the content as an image paragraph
+ }
+ $entry = new FreshRSS_Entry(
+ $dao['id_feed'] ?? 0,
+ $dao['guid'] ?? '',
+ $dao['title'] ?? '',
+ $dao['author'] ?? '',
+ $dao['content'] ?? '',
+ $dao['link'] ?? '',
+ $dao['date'] ?? 0,
+ $dao['is_read'] ?? false,
+ $dao['is_favorite'] ?? false,
+ $dao['tags'] ?? ''
+ );
+ if (isset($dao['id'])) { // the id is only set when one is supplied
+ $entry->_id($dao['id']);
+ }
+ if (!empty($dao['timestamp'])) { // textual date: parsed by strtotime() and passed to _date() after construction
+ $entry->_date(strtotime($dao['timestamp']));
+ }
+ if (!empty($dao['categories'])) { // non-empty categories are passed to _tags() after the constructor received 'tags'
+ $entry->_tags($dao['categories']);
+ }
+ return $entry;
+ }
+
/** Returns the value of this entry's id property; the declared return type is string. */
public function id(): string {
return $this->id;
}
@@ -83,6 +115,7 @@ class FreshRSS_Entry extends Minz_Model {
return $this->content;
}
+ /** @return array<array<string,string>> */
public function enclosures(bool $searchBodyImages = false): array {
$results = [];
try {
@@ -97,11 +130,20 @@ class FreshRSS_Entry extends Minz_Model {
if ($searchEnclosures) {
$enclosures = $xpath->query('//div[@class="enclosure"]/p[@class="enclosure-content"]/*[@src]');
foreach ($enclosures as $enclosure) {
- $results[] = [
+ $result = [
'url' => $enclosure->getAttribute('src'),
'type' => $enclosure->getAttribute('data-type'),
+ 'medium' => $enclosure->getAttribute('data-medium'),
'length' => $enclosure->getAttribute('data-length'),
];
+ if (empty($result['medium'])) {
+ switch (strtolower($enclosure->nodeName)) {
+ case 'img': $result['medium'] = 'image'; break;
+ case 'video': $result['medium'] = 'video'; break;
+ case 'audio': $result['medium'] = 'audio'; break;
+ }
+ }
+ $results[] = $result;
}
}
if ($searchBodyImages) {
@@ -432,52 +474,12 @@ class FreshRSS_Entry extends Minz_Model {
}
}
- public static function getContentByParsing(string $url, string $path, array $attributes = array(), int $maxRedirs = 3): string {
- $limits = FreshRSS_Context::$system_conf->limits;
- $feed_timeout = empty($attributes['timeout']) ? 0 : intval($attributes['timeout']);
-
- if (FreshRSS_Context::$system_conf->simplepie_syslog_enabled) {
- syslog(LOG_INFO, 'FreshRSS GET ' . SimplePie_Misc::url_remove_credentials($url));
- }
-
- $ch = curl_init();
- curl_setopt_array($ch, [
- CURLOPT_URL => $url,
- CURLOPT_REFERER => SimplePie_Misc::url_remove_credentials($url),
- CURLOPT_HTTPHEADER => array('Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
- CURLOPT_USERAGENT => FRESHRSS_USERAGENT,
- CURLOPT_CONNECTTIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
- CURLOPT_TIMEOUT => $feed_timeout > 0 ? $feed_timeout : $limits['timeout'],
- //CURLOPT_FAILONERROR => true;
- CURLOPT_MAXREDIRS => 4,
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_ENCODING => '', //Enable all encodings
- ]);
-
- curl_setopt_array($ch, FreshRSS_Context::$system_conf->curl_options);
-
- if (isset($attributes['curl_params']) && is_array($attributes['curl_params'])) {
- curl_setopt_array($ch, $attributes['curl_params']);
- }
-
- if (isset($attributes['ssl_verify'])) {
- curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, $attributes['ssl_verify'] ? 2 : 0);
- curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, $attributes['ssl_verify'] ? true : false);
- if (!$attributes['ssl_verify']) {
- curl_setopt($ch, CURLOPT_SSL_CIPHER_LIST, 'DEFAULT@SECLEVEL=1');
- }
- }
- $html = curl_exec($ch);
- $c_status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
- $c_error = curl_error($ch);
- curl_close($ch);
-
- if ($c_status != 200 || $c_error != '') {
- Minz_Log::warning('Error fetching content: HTTP code ' . $c_status . ': ' . $c_error . ' ' . $url);
- }
-
- if (is_string($html) && strlen($html) > 0) {
+ /**
+ * @param array<string,mixed> $attributes
+ */
+ public static function getContentByParsing(string $url, string $path, array $attributes = [], int $maxRedirs = 3): string {
+ $html = getHtml($url, $attributes);
+ if (strlen($html) > 0) {
require_once(LIB_PATH . '/lib_phpQuery.php');
/**
* @var phpQueryObject @doc