From 403923d3d12ed8316fd56ab758547b0352fb0ceb Mon Sep 17 00:00:00 2001
From: Marien Fressinaud
Date: Wed, 7 Nov 2012 22:30:21 +0100
Subject: Changement de library pour parser les sites dont on doit récupérer le contenu
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/lib_rss.php | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'lib/lib_rss.php')

diff --git a/lib/lib_rss.php b/lib/lib_rss.php
index f0a65fa49..e957a11fb 100644
--- a/lib/lib_rss.php
+++ b/lib/lib_rss.php
@@ -181,28 +181,24 @@ function get_path ($url) {
 }
 
 
-/* supprime les trucs inutiles des balises html */
-function good_bye_extra ($element) {
-	$element->style = null;
-	$element->class = null;
-	$element->id = null;
-	$element->onload = null;
-}
 /* permet de récupérer le contenu d'un article pour un flux qui n'est pas complet */
 function get_content_by_parsing ($url, $path) {
-	$html = new simple_html_dom ();
-	$html->set_callback ('good_bye_extra');
-	$ok = $html->load_file ($url);
+	$html = file_get_contents ($url);
 
-	if ($ok !== false) {
-		$content = $html->find ($path, 0);
-		$html->clear ();
-
-		if ($content) {
-			return $content->__toString ();
-		} else {
-			throw new Exception ();
-		}
+	if ($html) {
+		$doc = phpQuery::newDocument ($html);
+		$content = $doc->find ($path);
+		$content->find ('*')->removeAttr ('style')
+			->removeAttr ('id')
+			->removeAttr ('class')
+			->removeAttr ('onload')
+			->removeAttr ('target');
+		$content->removeAttr ('style')
+			->removeAttr ('id')
+			->removeAttr ('class')
+			->removeAttr ('onload')
+			->removeAttr ('target');
+		return $content->__toString ();
 	} else {
 		throw new Exception ();
 	}
-- 
cgit v1.2.3