From 6464666075170b006501c4f12d6a2f470300af46 Mon Sep 17 00:00:00 2001 From: Alexandre Alapetite Date: Thu, 7 Nov 2013 19:18:52 +0100 Subject: Plus de tolérance pour les flux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Utilise une méthode moins stricte qui tolère des erreurs XML dans les flux. Le choix de l'encodage en entrée a dû être légèrement changé car SimplePie a une stratégie d'essayer plusieurs encodages jusqu'à en trouver un qui marche. En étant moins strict sur les erreurs, ça marche plus souvent, et du coup les encodages n'étaient plus bon. À essayer avec plein de flux. Tous mes flux passent (~150). Devrait permettre de fermer https://github.com/marienfressinaud/FreshRSS/issues/233 Fonctionne aussi avec des flux à encodage invalide comme http://travaux.ovh.net/rss.php qui se déclare en "text/xml" (du coup ASCII) mais dans le flux avec un entête XML déclarant de l'UTF-8 --- lib/SimplePie/SimplePie/Parser.php | 105 +++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 44 deletions(-) (limited to 'lib') diff --git a/lib/SimplePie/SimplePie/Parser.php b/lib/SimplePie/SimplePie/Parser.php index d698552ca..72878c25a 100644 --- a/lib/SimplePie/SimplePie/Parser.php +++ b/lib/SimplePie/SimplePie/Parser.php @@ -77,56 +77,73 @@ class SimplePie_Parser public function parse(&$data, $encoding) { - // Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character - if (strtoupper($encoding) === 'US-ASCII') + if (!empty($encoding)) { - $this->encoding = 'UTF-8'; - } - else - { - $this->encoding = $encoding; - } - - // Strip BOM: - // UTF-32 Big Endian BOM - if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") - { - $data = substr($data, 4); - } - // UTF-32 Little Endian BOM - elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") - { - $data = substr($data, 4); - } - // UTF-16 Big Endian BOM - elseif (substr($data, 0, 2) === "\xFE\xFF") - { - $data = substr($data, 2); - } - // UTF-16 Little Endian BOM - elseif (substr($data, 0, 2) === "\xFF\xFE") - { - $data = substr($data, 2); - } - // UTF-8 BOM - elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") - { - $data = substr($data, 3); - } - - if (substr($data, 0, 5) === '')) !== false) - { - $declaration = $this->registry->create('XML_Declaration_Parser', array(substr($data, 5, $pos - 5))); - if ($declaration->parse()) + // Use UTF-8 if we get passed US-ASCII, as every US-ASCII character is a UTF-8 character + if (strtoupper($encoding) === 'US-ASCII') { - $data = substr($data, $pos + 2); - $data = 'version . '" encoding="' . $encoding . '" standalone="' . (($declaration->standalone) ? 'yes' : 'no') . '"?>' . $data; + $this->encoding = 'UTF-8'; } else { - $this->error_string = 'SimplePie bug! Please report this!'; - return false; + $this->encoding = $encoding; + } + + // Strip BOM: + // UTF-32 Big Endian BOM + if (substr($data, 0, 4) === "\x00\x00\xFE\xFF") + { + $data = substr($data, 4); } + // UTF-32 Little Endian BOM + elseif (substr($data, 0, 4) === "\xFF\xFE\x00\x00") + { + $data = substr($data, 4); + } + // UTF-16 Big Endian BOM + elseif (substr($data, 0, 2) === "\xFE\xFF") + { + $data = substr($data, 2); + } + // UTF-16 Little Endian BOM + elseif (substr($data, 0, 2) === "\xFF\xFE") + { + $data = substr($data, 2); + } + // UTF-8 BOM + elseif (substr($data, 0, 3) === "\xEF\xBB\xBF") + { + $data = substr($data, 3); + } + + if (substr($data, 0, 5) === '')) !== false) + { + $declaration = $this->registry->create('XML_Declaration_Parser', array(substr($data, 5, $pos - 5))); + if ($declaration->parse()) + { + $data = substr($data, $pos + 2); + $data = 'version . '" encoding="' . $encoding . '" standalone="' . (($declaration->standalone) ? 'yes' : 'no') . '"?>' . $data; + } + else + { + $this->error_string = 'SimplePie bug! Please report this!'; + return false; + } + } + } + + try + { + $dom = new DOMDocument(); + $dom->recover = true; + $dom->strictErrorChecking = false; + $dom->loadXML($data); + $this->encoding = $encoding = $dom->encoding = 'UTF-8'; + $data = $dom->saveXML(); + //file_put_contents('/home/alex/public_html/alexandre.alapetite.fr/prive/FreshRSS/log/parser.log', date('c') . ' ' . 'OK' . "\n", FILE_APPEND); + } + catch (Exception $e) + { } $return = true; -- cgit v1.2.3