diff options
| author | 2023-01-18 10:12:21 +0100 | |
|---|---|---|
| committer | 2023-01-18 10:12:21 +0100 | |
| commit | daaa391e33c5d92e3dd91bb0b81ac420abed7097 (patch) | |
| tree | a3263c26ac90fb3115627e156eba580826acfd4f /lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php | |
| parent | 216e39c3cc43061686981b96328796765d264d29 (diff) | |
tec: Update the lib_opml (#4403)
* fix: Fix undefined GLOB_BRACE on Alpine
The manual states that:
> Note: The GLOB_BRACE flag is not available on some non GNU systems,
> like Solaris or Alpine Linux.
This generated an error on Alpine.
Reference: https://www.php.net/manual/function.glob.php
* fix: List details of feeds for OPML exportation
The details are necessary to export the XPath information, the CSS full
content path and read actions filters.
* Update LibOpml to 0.4.0
* Refactor OPML importation to be more robust
First, it fixes two regressions introduced by the update of lib_opml:
- title attribute is used when text attribute is missing;
- the OPML category attribute is used as a fallback for feeds categories.
In a related way, if also fixes a problem when a feed had both a parent
category outline and a category attribute. Before, it only considered the
attribute as its category, but now it considers the parent outline.
Then, it counts category limit correctly by not increasing
`$nb_categories` if the category already exists.
* Exclude lib_opml from the CodeSniffer
* Fix variable names when logging some errors
* Fix catch of LibOpml Exception
* Make sure to declare the category
* Exclude lib_opml from PHPStan analyze
* Disable markdownlint for lib_opml
* Fix typos
* Use auto-loading and allow updates via Composer
* Fix broken links to lib_opml
* Bring back the ability to import the OPML frss:opmlUrl attribute
* Refactor the logs of OPML errors
* Update lib_opml to the version 0.5.0
Co-authored-by: Alexandre Alapetite <alexandre@alapetite.fr>
Diffstat (limited to 'lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php')
| -rw-r--r-- | lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php | 770 |
1 files changed, 770 insertions, 0 deletions
diff --git a/lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php b/lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php new file mode 100644 index 000000000..4ba0df821 --- /dev/null +++ b/lib/marienfressinaud/lib_opml/src/LibOpml/LibOpml.php @@ -0,0 +1,770 @@ +<?php + +namespace marienfressinaud\LibOpml; + +/** + * The LibOpml class provides the methods to read and write OPML files and + * strings. It transforms OPML files or strings to PHP arrays (or the reverse). + * + * How to read this file? + * + * The first methods are dedicated to the parsing, and the next ones to the + * reading. The three last methods are helpful methods, but you don't have to + * worry too much about them. + * + * The main methods are the public ones: parseFile, parseString and render. + * They call the other parse* and render* methods internally. + * + * These three main methods are available as functions (see the src/functions.php + * file). + * + * What's the array format? + * + * As said before, LibOpml transforms OPML to PHP arrays, or the reverse. The + * format is pretty simple. It contains four keys: + * + * - version: the version of the OPML; + * - namespaces: an array of namespaces used in the OPML, if any; + * - head: an array of OPML head elements, where keys are the names of the + * elements; + * - body: an array of arrays representing OPML outlines, where keys are the + * name of the attributes (the special @outlines key contains the sub-outlines). + * + * When rendering, only the body key is required (version will default to 2.0). + * + * Example: + * + * [ + * version => '2.0', + * namespaces => [], + * head => [ + * title => 'An OPML file' + * ], + * body => [ + * [ + * text => 'Newspapers', + * @outlines => [ + * [text => 'El PaĆs'], + * [text => 'Le Monde'], + * [text => 'The Guardian'], + * [text => 'The New York Times'], + * ] + * ] + * ] + * ] + * + * @see http://opml.org/spec2.opml + * + * @author Marien Fressinaud <dev@marienfressinaud.fr> + * @link https://framagit.org/marienfressinaud/lib_opml + * @license MIT + */ +class LibOpml +{ + /** + * The list of valid head elements. + */ + public const HEAD_ELEMENTS = [ + 'title', 'dateCreated', 'dateModified', 'ownerName', 'ownerEmail', + 'ownerId', 'docs', 'expansionState', 'vertScrollState', 'windowTop', + 'windowLeft', 'windowBottom', 'windowRight' + ]; + + /** + * The list of numeric head elements. + */ + public const NUMERIC_HEAD_ELEMENTS = [ + 'vertScrollState', + 'windowTop', + 'windowLeft', + 'windowBottom', + 'windowRight', + ]; + + /** @var boolean */ + private $strict = true; + + /** @var string */ + private $version = '2.0'; + + /** @var string[] */ + private $namespaces = []; + + /** + * @param bool $strict + * Set to true (default) to check for violations of the specification, + * false otherwise. + */ + public function __construct($strict = true) + { + $this->strict = $strict; + } + + /** + * Parse a XML file and return the corresponding array. + * + * @param string $filename + * The XML file to parse. + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the file cannot be read. See also exceptions raised by the + * parseString method. + * + * @return array + * An array reflecting the OPML (the structure is described above). + */ + public function parseFile($filename) + { + $file_content = @file_get_contents($filename); + + if ($file_content === false) { + throw new Exception("OPML file {$filename} cannot be found or read"); + } + + return $this->parseString($file_content); + } + + /** + * Parse a XML string and return the corresponding array. + * + * @param string $xml + * The XML string to parse. + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the XML cannot be parsed, if version is missing or + * invalid, if head is missing or contains invalid (or not parsable) + * elements, or if body is missing, empty or contain non outline + * elements. The exceptions (except XML parsing errors) are not raised + * if strict is false. See also exceptions raised by the parseOutline + * method. + * + * @return array + * An array reflecting the OPML (the structure is described above). + */ + public function parseString($xml) + { + $dom = new \DOMDocument(); + $dom->recover = true; + $dom->encoding = 'UTF-8'; + + try { + $result = @$dom->loadXML($xml); + } catch (\Exception | \Error $e) { + $result = false; + } + + if (!$result) { + throw new Exception('OPML string is not valid XML'); + } + + $opml_element = $dom->documentElement; + + // Load the custom namespaces of the document + $xpath = new \DOMXPath($dom); + $this->namespaces = []; + foreach ($xpath->query('//namespace::*') as $node) { + if ($node->prefix === 'xml') { + // This is the base namespace, we don't need to store it + continue; + } + + $this->namespaces[$node->prefix] = $node->namespaceURI; + } + + // Get the version of the document + $version = $opml_element->getAttribute('version'); + if (!$version) { + $this->throwExceptionIfStrict('OPML version attribute is required'); + } + + $version = trim($version); + if ($version === '1.1') { + $version = '1.0'; + } + + if ($version !== '1.0' && $version !== '2.0') { + $this->throwExceptionIfStrict('OPML supported versions are 1.0 and 2.0'); + } + + $this->version = $version; + + // Get head and body child elements + $head_elements = $opml_element->getElementsByTagName('head'); + $child_head_elements = []; + if (count($head_elements) === 1) { + $child_head_elements = $head_elements[0]->childNodes; + } else { + $this->throwExceptionIfStrict('OPML must contain one and only one head element'); + } + + $body_elements = $opml_element->getElementsByTagName('body'); + $child_body_elements = []; + if (count($body_elements) === 1) { + $child_body_elements = $body_elements[0]->childNodes; + } else { + $this->throwExceptionIfStrict('OPML must contain one and only one body element'); + } + + $array = [ + 'version' => $this->version, + 'namespaces' => $this->namespaces, + 'head' => [], + 'body' => [], + ]; + + // Load the child head elements in the head array + foreach ($child_head_elements as $child_head_element) { + if ($child_head_element->nodeType !== XML_ELEMENT_NODE) { + continue; + } + + $name = $child_head_element->nodeName; + $value = $child_head_element->nodeValue; + $namespaced = $child_head_element->namespaceURI !== null; + + if (!in_array($name, self::HEAD_ELEMENTS) && !$namespaced) { + $this->throwExceptionIfStrict( + "OPML head {$name} element is not part of the specification" + ); + } + + if ($name === 'dateCreated' || $name === 'dateModified') { + try { + $value = $this->parseDate($value); + } catch (\DomainException $e) { + $this->throwExceptionIfStrict( + "OPML head {$name} element must be a valid RFC822 or RFC1123 date" + ); + } + } elseif ($name === 'ownerEmail') { + // Testing email validity is hard. PHP filter_var() function is + // too strict compared to the RFC 822, so we can't use it. + if (strpos($value, '@') === false) { + $this->throwExceptionIfStrict( + 'OPML head ownerEmail element must be an email address' + ); + } + } elseif ($name === 'ownerId' || $name === 'docs') { + if (!$this->checkHttpAddress($value)) { + $this->throwExceptionIfStrict( + "OPML head {$name} element must be a HTTP address" + ); + } + } elseif ($name === 'expansionState') { + $numbers = explode(',', $value); + $value = array_map(function ($str_number) { + if (is_numeric($str_number)) { + return intval($str_number); + } else { + $this->throwExceptionIfStrict( + 'OPML head expansionState element must be a list of numbers' + ); + return $str_number; + } + }, $numbers); + } elseif (in_array($name, self::NUMERIC_HEAD_ELEMENTS)) { + if (is_numeric($value)) { + $value = intval($value); + } else { + $this->throwExceptionIfStrict("OPML head {$name} element must be a number"); + } + } + + $array['head'][$name] = $value; + } + + // Load the child body elements in the body array + foreach ($child_body_elements as $child_body_element) { + if ($child_body_element->nodeType !== XML_ELEMENT_NODE) { + continue; + } + + if ($child_body_element->nodeName === 'outline') { + $array['body'][] = $this->parseOutline($child_body_element); + } else { + $this->throwExceptionIfStrict( + 'OPML body element can only contain outline elements' + ); + } + } + + if (empty($array['body'])) { + $this->throwExceptionIfStrict( + 'OPML body element must contain at least one outline element' + ); + } + + return $array; + } + + /** + * Parse a XML element as an outline element and return the corresponding array. + * + * @param \DOMElement $outline_element + * The element to parse. + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the outline contains non-outline elements, if it doesn't + * contain a text attribute (or if empty), if a special attribute is + * not parsable, or if type attribute requirements are not met. The + * exceptions are not raised if strict is false. The exception about + * missing text attribute is not raised if version is 1.0. + * + * @return array + * An array reflecting the OPML outline (the structure is described above). + */ + private function parseOutline($outline_element) + { + $outline = []; + + // Load the element attributes in the outline array + foreach ($outline_element->attributes as $outline_attribute) { + $name = $outline_attribute->nodeName; + $value = $outline_attribute->nodeValue; + + if ($name === 'created') { + try { + $value = $this->parseDate($value); + } catch (\DomainException $e) { + $this->throwExceptionIfStrict( + 'OPML outline created attribute must be a valid RFC822 or RFC1123 date' + ); + } + } elseif ($name === 'category') { + $categories = explode(',', $value); + $categories = array_map(function ($category) { + return trim($category); + }, $categories); + $value = $categories; + } elseif ($name === 'isComment' || $name === 'isBreakpoint') { + if ($value === 'true' || $value === 'false') { + $value = $value === 'true'; + } else { + $this->throwExceptionIfStrict( + "OPML outline {$name} attribute must be a boolean (true or false)" + ); + } + } elseif ($name === 'type') { + // type attribute is case-insensitive + $value = strtolower($value); + } + + $outline[$name] = $value; + } + + if (empty($outline['text']) && $this->version !== '1.0') { + $this->throwExceptionIfStrict( + 'OPML outline text attribute is required' + ); + } + + // Perform additional check based on the type of the outline + $type = $outline['type'] ?? ''; + if ($type === 'rss') { + if (empty($outline['xmlUrl'])) { + $this->throwExceptionIfStrict( + 'OPML outline xmlUrl attribute is required when type is "rss"' + ); + } elseif (!$this->checkHttpAddress($outline['xmlUrl'])) { + $this->throwExceptionIfStrict( + 'OPML outline xmlUrl attribute must be a HTTP address when type is "rss"' + ); + } + } elseif ($type === 'link' || $type === 'include') { + if (empty($outline['url'])) { + $this->throwExceptionIfStrict( + "OPML outline url attribute is required when type is \"{$type}\"" + ); + } elseif (!$this->checkHttpAddress($outline['url'])) { + $this->throwExceptionIfStrict( + "OPML outline url attribute must be a HTTP address when type is \"{$type}\"" + ); + } + } + + // Load the sub-outlines in a @outlines array + foreach ($outline_element->childNodes as $child_outline_element) { + if ($child_outline_element->nodeType !== XML_ELEMENT_NODE) { + continue; + } + + if ($child_outline_element->nodeName === 'outline') { + $outline['@outlines'][] = $this->parseOutline($child_outline_element); + } else { + $this->throwExceptionIfStrict( + 'OPML body element can only contain outline elements' + ); + } + } + + return $outline; + } + + /** + * Parse a value as a date. + * + * @param string $value + * + * @throws \DomainException + * Raised if the value cannot be parsed. + * + * @return \DateTime + */ + private function parseDate($value) + { + $formats = [ + \DateTimeInterface::RFC822, + \DateTimeInterface::RFC1123, + ]; + + foreach ($formats as $format) { + $date = date_create_from_format($format, $value); + if ($date !== false) { + return $date; + } + } + + throw new \DomainException('The argument cannot be parsed as a date'); + } + + /** + * Render an OPML array as a string or a \DOMDocument. + * + * @param array $array + * The array to render, it must follow the structure defined above. + * @param bool $as_dom_document + * Set to false (default) to return the array as a string, true to + * return as a \DOMDocument. + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the `head` array contains unknown or invalid elements + * (i.e. not of correct type), or if the `body` array is missing or + * empty. The exceptions are not raised if strict is false. See also + * exceptions raised by the renderOutline method. + * + * @return string|\DOMDocument + * The XML string or DOM document corresponding to the given array. + */ + public function render($array, $as_dom_document = false) + { + $dom = new \DOMDocument('1.0', 'UTF-8'); + $opml_element = new \DOMElement('opml'); + $dom->appendChild($opml_element); + + // Set the version attribute of the OPML document + $version = $array['version'] ?? '2.0'; + + if ($version === '1.1') { + $version = '1.0'; + } + + if ($version !== '1.0' && $version !== '2.0') { + $this->throwExceptionIfStrict('OPML supported versions are 1.0 and 2.0'); + } + + $this->version = $version; + $opml_element->setAttribute('version', $this->version); + + // Declare the namespace on the opml element + $this->namespaces = $array['namespaces'] ?? []; + foreach ($this->namespaces as $prefix => $namespace) { + $opml_element->setAttributeNS( + 'http://www.w3.org/2000/xmlns/', + "xmlns:{$prefix}", + $namespace + ); + } + + // Add the head element to the OPML document. $array['head'] is + // optional but head tag will always exist in the final XML. + $head_element = new \DOMElement('head'); + $opml_element->appendChild($head_element); + if (isset($array['head'])) { + foreach ($array['head'] as $name => $value) { + $namespace = $this->getNamespace($name); + + if (!in_array($name, self::HEAD_ELEMENTS, true) && !$namespace) { + $this->throwExceptionIfStrict( + "OPML head {$name} element is not part of the specification" + ); + } + + if ($name === 'dateCreated' || $name === 'dateModified') { + if ($value instanceof \DateTimeInterface) { + $value = $value->format(\DateTimeInterface::RFC1123); + } else { + $this->throwExceptionIfStrict( + "OPML head {$name} element must be a DateTime" + ); + } + } elseif ($name === 'ownerEmail') { + // Testing email validity is hard. PHP filter_var() function is + // too strict compared to the RFC 822, so we can't use it. + if (strpos($value, '@') === false) { + $this->throwExceptionIfStrict( + 'OPML head ownerEmail element must be an email address' + ); + } + } elseif ($name === 'ownerId' || $name === 'docs') { + if (!$this->checkHttpAddress($value)) { + $this->throwExceptionIfStrict( + "OPML head {$name} element must be a HTTP address" + ); + } + } elseif ($name === 'expansionState') { + if (is_array($value)) { + foreach ($value as $number) { + if (!is_int($number)) { + $this->throwExceptionIfStrict( + 'OPML head expansionState element must be an array of integers' + ); + } + } + + $value = implode(', ', $value); + } else { + $this->throwExceptionIfStrict( + 'OPML head expansionState element must be an array of integers' + ); + } + } elseif (in_array($name, self::NUMERIC_HEAD_ELEMENTS)) { + if (!is_int($value)) { + $this->throwExceptionIfStrict( + "OPML head {$name} element must be an integer" + ); + } + } + + $child_head_element = new \DOMElement($name, $value, $namespace); + $head_element->appendChild($child_head_element); + } + } + + // Check body is set and contains at least one element + if (!isset($array['body'])) { + $this->throwExceptionIfStrict('OPML array must contain a body key'); + } + + $array_body = $array['body'] ?? []; + if (count($array_body) <= 0) { + $this->throwExceptionIfStrict( + 'OPML body element must contain at least one outline array' + ); + } + + // Create outline elements in the body element + $body_element = new \DOMElement('body'); + $opml_element->appendChild($body_element); + foreach ($array_body as $outline) { + $this->renderOutline($body_element, $outline); + } + + // And return the final result + if ($as_dom_document) { + return $dom; + } else { + $dom->formatOutput = true; + return $dom->saveXML(); + } + } + + /** + * Transform an outline array to a \DOMElement and add it to a parent element. + * + * @param \DOMElement $parent_element + * The DOM parent element of the current outline. + * @param array $outline + * The outline array to transform in a \DOMElement, it must follow the + * structure defined above. + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the outline is not an array, if it doesn't contain a text + * attribute (or if empty), if the `@outlines` key is not an array, if + * a special attribute does not match its corresponding type, or if + * `type` key requirements are not met. The exceptions (except errors + * about outline or suboutlines not being arrays) are not raised if + * strict is false. The exception about missing text attribute is not + * raised if version is 1.0. + */ + private function renderOutline($parent_element, $outline) + { + // Perform initial checks to verify the outline is correctly declared + if (!is_array($outline)) { + throw new Exception( + 'OPML outline element must be defined as an array' + ); + } + + if (empty($outline['text']) && $this->version !== '1.0') { + $this->throwExceptionIfStrict( + 'OPML outline text attribute is required' + ); + } + + if (isset($outline['type'])) { + $type = strtolower($outline['type']); + + if ($type === 'rss') { + if (empty($outline['xmlUrl'])) { + $this->throwExceptionIfStrict( + 'OPML outline xmlUrl attribute is required when type is "rss"' + ); + } elseif (!$this->checkHttpAddress($outline['xmlUrl'])) { + $this->throwExceptionIfStrict( + 'OPML outline xmlUrl attribute must be a HTTP address when type is "rss"' + ); + } + } elseif ($type === 'link' || $type === 'include') { + if (empty($outline['url'])) { + $this->throwExceptionIfStrict( + "OPML outline url attribute is required when type is \"{$type}\"" + ); + } elseif (!$this->checkHttpAddress($outline['url'])) { + $this->throwExceptionIfStrict( + "OPML outline url attribute must be a HTTP address when type is \"{$type}\"" + ); + } + } + } + + // Create the outline element and add it to the parent + $outline_element = new \DOMElement('outline'); + $parent_element->appendChild($outline_element); + + // Load the sub-outlines as child elements + if (isset($outline['@outlines'])) { + $outline_children = $outline['@outlines']; + + if (!is_array($outline_children)) { + throw new Exception( + 'OPML outline element must be defined as an array' + ); + } + + foreach ($outline_children as $outline_child) { + $this->renderOutline($outline_element, $outline_child); + } + + // We don't want the sub-outlines to be loaded as attributes, so we + // remove the key from the array. + unset($outline['@outlines']); + } + + // Load the other elements of the array as attributes + foreach ($outline as $name => $value) { + $namespace = $this->getNamespace($name); + + if ($name === 'created') { + if ($value instanceof \DateTimeInterface) { + $value = $value->format(\DateTimeInterface::RFC1123); + } else { + $this->throwExceptionIfStrict( + 'OPML outline created attribute must be a DateTime' + ); + } + } elseif ($name === 'isComment' || $name === 'isBreakpoint') { + if (is_bool($value)) { + $value = $value ? 'true' : 'false'; + } else { + $this->throwExceptionIfStrict( + "OPML outline {$name} attribute must be a boolean" + ); + } + } elseif (is_array($value)) { + $value = implode(', ', $value); + } + + $outline_element->setAttributeNS($namespace, $name, $value); + } + } + + /** + * Return wether a value is a valid HTTP address or not. + * + * HTTP address is not strictly defined by the OPML spec, so it is assumed: + * + * - it can be parsed by parse_url + * - it has a host part + * - scheme is http or https + * + * filter_var is not used because it would reject internationalized URLs + * (i.e. with non ASCII chars). An alternative would be to punycode such + * URLs, but it's more work to do it properly, and lib_opml needs to stay + * simple. + * + * @param string $value + * + * @return boolean + * Return true if the value is a valid HTTP address, false otherwise. + */ + public function checkHttpAddress($value) + { + $value = trim($value); + $parsed_url = parse_url($value); + if (!$parsed_url) { + return false; + } + + if ( + !isset($parsed_url['scheme']) || + !isset($parsed_url['host']) + ) { + return false; + } + + if ( + $parsed_url['scheme'] !== 'http' && + $parsed_url['scheme'] !== 'https' + ) { + return false; + } + + return true; + } + + /** + * Return the namespace of a qualified name. An empty string is returned if + * the name is not namespaced. + * + * @param string $qualified_name + * + * @throws \marienfressinaud\LibOpml\Exception + * Raised if the namespace prefix isn't declared. + * + * @return string + */ + private function getNamespace($qualified_name) + { + $split_name = explode(':', $qualified_name, 2); + // count will always be 1 or 2. + if (count($split_name) === 1) { + // If 1, there's no prefix, thus no namespace + return ''; + } else { + // If 2, it means it has a namespace prefix, so we get the + // namespace from the declared ones. + $namespace_prefix = $split_name[0]; + if (!isset($this->namespaces[$namespace_prefix])) { + throw new Exception( + "OPML namespace {$namespace_prefix} is not declared" + ); + } + + return $this->namespaces[$namespace_prefix]; + } + } + + /** + * Raise an exception only if strict is true. + * + * @param string $message + * + * @throws \marienfressinaud\LibOpml\Exception + */ + private function throwExceptionIfStrict($message) + { + if ($this->strict) { + throw new Exception($message); + } + } +} |
