Source for file rss_parse.inc

Documentation is available at rss_parse.inc

  1. <?php
  2.  
  3. /**
  4. * Project: MagpieRSS: a simple RSS integration tool
  5. * File: rss_parse.inc - parse an RSS or Atom feed
  6. * return as a simple object.
  7. *
  8. * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
  9. *
  10. * The latest version of MagpieRSS can be obtained from:
  11. * http://magpierss.sourceforge.net
  12. *
  13. * For questions, help, comments, discussion, etc., please join the
  14. * Magpie mailing list:
  15. * magpierss-general@lists.sourceforge.net
  16. *
  17. * @author Kellan Elliott-McCrea <kellan@protest.net>
  18. * @version 0.7a
  19. * @license GPL
  20. *
  21. */
  22.  
  // Feed-type markers stored in MagpieRSS::$feed_type.
  // Guarded so that loading this file after another component has already
  // defined the same constants does not raise a "constant already defined"
  // warning.
  if (!defined('RSS')) {
      define('RSS', 'RSS');
  }
  if (!defined('ATOM')) {
      define('ATOM', 'Atom');
  }
  25.  
  26. // Not needed on MidCOM
  27. // require_once (MAGPIE_DIR . 'rss_utils.inc');
  28.  
  29.  
  30.  
  /**
   * Hybrid parser and result object: takes a feed as a string, parses it,
   * and exposes the result through public properties ($channel, $items,
   * $image, $textinput, ...).
   *
   * The class relies on the global constants RSS and ATOM (defined at the top
   * of this file) to tag the detected feed type.
   *
   * see: rss_fetch.inc for a simpler interface with integrated caching support
   */
  class MagpieRSS {
      var $parser;                    // XML parser handle; reset to null once parsing is done
      var $current_item = array();    // item currently being parsed
      var $items = array();           // collection of parsed items
      var $channel = array();         // hash of channel fields
      var $textinput = array();
      var $image = array();
      var $feed_type;                 // RSS or ATOM, unset until the root element is seen
      var $feed_version;              // value of the root element's "version" attribute (null if absent)
      var $encoding = '';             // output encoding of parsed rss
      var $_source_encoding = '';     // only set if we have to parse xml prolog
      var $source_encoding = '';      // legacy alias of $_source_encoding (older code wrote to this name)
      var $ERROR = "";
      var $WARNING = "";

      // define some constants
      var $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
      var $_KNOWN_ENCODINGS = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

      // parser variables, useless if you're not a parser, treat as private
      var $stack = array();           // parser stack
      var $inchannel = false;
      var $initem = false;
      var $incontent = false;         // false, or the name of the Atom content construct being read
      var $intextinput = false;
      var $inimage = false;
      var $current_field = '';
      var $current_namespace = false;

      /**
       * Legacy (PHP 4 style) entry point.
       *
       * Kept so that existing code calling parent::MagpieRSS(...) or
       * $obj->MagpieRSS(...) keeps working; all work happens in __construct().
       *
       * @param string $source          string containing the RSS to be parsed
       * @param string $output_encoding see __construct()
       * @param string $input_encoding  see __construct()
       * @param bool   $detect_encoding see __construct()
       */
      function MagpieRSS ($source, $output_encoding='ISO-8859-1',
                          $input_encoding=null, $detect_encoding=true)
      {
          $this->__construct($source, $output_encoding, $input_encoding, $detect_encoding);
      }

      /**
       * Set up XML parser, parse source, and populate this object.
       *
       * On a setup failure (XML extension missing, parser cannot be created)
       * the problem is reported through error() and the object is left empty.
       * XML syntax errors are reported through error() as well; whatever was
       * parsed up to that point is still normalized and kept.
       *
       * @param string $source          string containing the RSS to be parsed
       *
       * @param string $output_encoding output the parsed RSS in this character
       *                                set; defaults to ISO-8859-1.
       *
       * @param string $input_encoding  the character set of the incoming RSS
       *                                source. Leave blank and Magpie will try
       *                                to figure it out.
       *
       * @param bool   $detect_encoding if false Magpie won't attempt to detect
       *                                source encoding. (caveat emptor)
       */
      function __construct ($source, $output_encoding='ISO-8859-1',
                            $input_encoding=null, $detect_encoding=true)
      {
          // Without the XML extension nothing below can work: report and stop.
          if (!function_exists('xml_parser_create')) {
              $this->error( "Failed to load PHP's XML Extension. " .
                            "http://www.php.net/manual/en/ref.xml.php",
                            E_USER_ERROR );
              return;
          }

          list($parser, $source) = $this->create_parser($source,
              $output_encoding, $input_encoding, $detect_encoding);

          // The handle is a resource on old PHP and an XMLParser object on
          // PHP 8, so test for truthiness rather than is_resource().
          if (!$parser) {
              $this->error( "Failed to create an instance of PHP's XML parser. " .
                            "http://www.php.net/manual/en/ref.xml.php",
                            E_USER_ERROR );
              return;
          }

          $this->parser = $parser;

          // Register the handlers as object callables; this replaces the
          // deprecated xml_set_object() + method-name-string combination.
          xml_set_element_handler( $this->parser,
              array($this, 'feed_start_element'),
              array($this, 'feed_end_element') );
          xml_set_character_data_handler( $this->parser, array($this, 'feed_cdata') );

          $status = xml_parse( $this->parser, $source );

          if (! $status ) {
              $errorcode = xml_get_error_code( $this->parser );
              if ( $errorcode != XML_ERROR_NONE ) {
                  $xml_error = xml_error_string( $errorcode );
                  $error_line = xml_get_current_line_number($this->parser);
                  $error_col = xml_get_current_column_number($this->parser);
                  $errormsg = "$xml_error at line $error_line, column $error_col";

                  $this->error( $errormsg );
              }
          }

          // Only resource handles need an explicit free. Dropping the handle
          // also keeps the populated object serializable (parser objects are
          // not) and breaks the parser <-> $this reference cycle.
          if (is_resource($this->parser)) {
              xml_parser_free( $this->parser );
          }
          $this->parser = null;

          $this->normalize();
      }

      /**
       * XML start-element handler.
       *
       * Identifies the feed type from the root element, tracks which
       * container (channel, item, textinput, image, Atom content construct)
       * is open, and pushes ordinary field elements onto the parser stack.
       *
       * @param mixed  $p       XML parser handle (unused)
       * @param string $element element name as reported by the parser
       * @param array  $attrs   attribute name => value map
       */
      function feed_start_element($p, $element, $attrs) {
          $el = $element = strtolower($element);
          $attrs = array_change_key_case($attrs, CASE_LOWER);

          // check for a namespace, and split if found
          $ns = false;
          if ( strpos( $element, ':' ) ) {
              list($ns, $el) = explode( ':', $element, 2);
          }
          if ( $ns and $ns != 'rdf' ) {
              $this->current_namespace = $ns;
          }

          // if feed type isn't set, then this is first element of feed
          // identify feed from root element
          if (!isset($this->feed_type) ) {
              $version = isset($attrs['version']) ? $attrs['version'] : null;
              if ( $el == 'rdf' ) {
                  $this->feed_type = RSS;
                  $this->feed_version = '1.0';
              }
              elseif ( $el == 'rss' ) {
                  $this->feed_type = RSS;
                  $this->feed_version = $version;
              }
              elseif ( $el == 'feed' ) {
                  $this->feed_type = ATOM;
                  $this->feed_version = $version;
                  $this->inchannel = true;
              }
              return;
          }

          if ( $el == 'channel' )
          {
              $this->inchannel = true;
          }
          elseif ($el == 'item' or $el == 'entry' )
          {
              $this->initem = true;
              if ( isset($attrs['rdf:about']) ) {
                  $this->current_item['about'] = $attrs['rdf:about'];
              }
          }
          // if we're in the default namespace of an RSS feed,
          // record textinput or image fields
          elseif (
              $this->feed_type == RSS and
              $this->current_namespace == '' and
              $el == 'textinput' )
          {
              $this->intextinput = true;
          }
          elseif (
              $this->feed_type == RSS and
              $this->current_namespace == '' and
              $el == 'image' )
          {
              $this->inimage = true;
          }
          // handle atom content constructs
          elseif ( $this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true) )
          {
              // avoid clashing w/ RSS mod_content
              if ($el == 'content' ) {
                  $el = 'atom_content';
              }
              $this->incontent = $el;
          }
          // if inside an Atom content construct (e.g. content or summary) field treat tags as text
          elseif ($this->feed_type == ATOM and $this->incontent )
          {
              // if tags are inlined, then flatten
              $attrs_str = $this->_flatten_attrs($attrs);
              $this->append_content( "<$element $attrs_str>" );
              array_unshift( $this->stack, $el );
          }
          // Atom supports many links per containing element.
          // Magpie treats link elements of type rel='alternate'
          // as being equivalent to RSS's simple link element.
          elseif ($this->feed_type == ATOM and $el == 'link' )
          {
              $rel  = isset($attrs['rel'])  ? $attrs['rel']  : '';
              $href = isset($attrs['href']) ? $attrs['href'] : '';
              $link_el = ($rel == 'alternate') ? 'link' : 'link_' . $rel;
              $this->append($link_el, $href);
          }
          // set stack[0] to current element
          else {
              array_unshift($this->stack, $el);
          }
      }

      /**
       * XML character-data handler: routes text either into the open Atom
       * content construct or into the field named by the current stack.
       *
       * @param mixed  $p    XML parser handle (unused)
       * @param string $text chunk of character data
       */
      function feed_cdata ($p, $text) {
          if ($this->feed_type == ATOM and $this->incontent)
          {
              $this->append_content( $text );
          }
          else {
              $current_el = join('_', array_reverse($this->stack));
              $this->append($current_el, $text);
          }
      }

      /**
       * XML end-element handler: closes containers, stores finished items,
       * and pops the parser stack for ordinary field elements.
       *
       * @param mixed  $p  XML parser handle (unused)
       * @param string $el element name as reported by the parser
       */
      function feed_end_element ($p, $el) {
          $el = strtolower($el);

          if ( $el == 'item' or $el == 'entry' )
          {
              $this->items[] = $this->current_item;
              $this->current_item = array();
              $this->initem = false;
          }
          elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'textinput' )
          {
              $this->intextinput = false;
          }
          elseif ($this->feed_type == RSS and $this->current_namespace == '' and $el == 'image' )
          {
              $this->inimage = false;
          }
          elseif ($this->feed_type == ATOM and in_array($el, $this->_CONTENT_CONSTRUCTS, true) )
          {
              $this->incontent = false;
          }
          elseif ($el == 'channel' or $el == 'feed' )
          {
              $this->inchannel = false;
          }
          elseif ($this->feed_type == ATOM and $this->incontent ) {
              // balance tags properly
              if ( isset($this->stack[0]) and $this->stack[0] == $el )
              {
                  $this->append_content("</$el>");
              }
              else {
                  $this->append_content("<$el />");
              }

              array_shift( $this->stack );
          }
          elseif ($this->feed_type == ATOM and $el == 'link' ) {
              // Atom links outside content are recorded at start-element time
              // and never pushed onto the stack, so there is nothing to pop.
          }
          else {
              array_shift( $this->stack );
          }

          $this->current_namespace = false;
      }

      /**
       * Append $str2 to $str1, initialising $str1 to "" when it is unset.
       *
       * @param string $str1 target, passed by reference (may be an unset array slot)
       * @param string $str2 text to append
       */
      function concat (&$str1, $str2="") {
          if (!isset($str1) ) {
              $str1="";
          }
          $str1 .= $str2;
      }

      /**
       * Append text to the open Atom content construct of the current item,
       * or of the channel when no item is open.
       *
       * @param string $text text or flattened markup to append
       */
      function append_content($text) {
          if ( $this->initem ) {
              $this->concat( $this->current_item[ $this->incontent ], $text );
          }
          elseif ( $this->inchannel ) {
              $this->concat( $this->channel[ $this->incontent ], $text );
          }
      }

      /**
       * Smart append - field and namespace aware.
       *
       * Stores $text under $el in whichever container is currently open;
       * namespaced fields are nested one level deeper under the namespace
       * prefix. Does nothing when $el is empty.
       *
       * @param string $el   field name
       * @param string $text text to append
       */
      function append($el, $text) {
          if (!$el) {
              return;
          }
          if ( $this->current_namespace )
          {
              if ( $this->initem ) {
                  $this->concat(
                      $this->current_item[ $this->current_namespace ][ $el ], $text);
              }
              elseif ($this->inchannel) {
                  $this->concat(
                      $this->channel[ $this->current_namespace][ $el ], $text );
              }
              elseif ($this->intextinput) {
                  $this->concat(
                      $this->textinput[ $this->current_namespace][ $el ], $text );
              }
              elseif ($this->inimage) {
                  $this->concat(
                      $this->image[ $this->current_namespace ][ $el ], $text );
              }
          }
          else {
              if ( $this->initem ) {
                  $this->concat(
                      $this->current_item[ $el ], $text);
              }
              elseif ($this->intextinput) {
                  $this->concat(
                      $this->textinput[ $el ], $text );
              }
              elseif ($this->inimage) {
                  $this->concat(
                      $this->image[ $el ], $text );
              }
              elseif ($this->inchannel) {
                  $this->concat(
                      $this->channel[ $el ], $text );
              }
          }
      }

      /**
       * Cross-populate RSS and Atom field names so callers can use either
       * vocabulary, and derive 'date_timestamp' for items where possible.
       *
       * Keyed on the detected feed type rather than on is_atom()/is_rss(),
       * so feeds whose root element carries no version attribute are
       * normalized too.
       */
      function normalize () {
          // if atom populate rss fields
          if ( $this->feed_type == ATOM ) {
              if ( isset($this->channel['tagline']) ) {
                  $this->channel['description'] = $this->channel['tagline'];
              }
              foreach ( array_keys($this->items) as $i ) {
                  $item = $this->items[$i];
                  if ( isset($item['summary']) )
                      $item['description'] = $item['summary'];
                  if ( isset($item['atom_content']))
                      $item['content']['encoded'] = $item['atom_content'];

                  // prefer the issue date, fall back to the modification date
                  $atom_date = false;
                  if ( isset($item['issued']) ) {
                      $atom_date = $item['issued'];
                  }
                  elseif ( isset($item['modified']) ) {
                      $atom_date = $item['modified'];
                  }
                  if ( $atom_date ) {
                      $epoch = $this->_w3cdtf_to_epoch($atom_date);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          }
          elseif ( $this->feed_type == RSS ) {
              if ( isset($this->channel['description']) ) {
                  $this->channel['tagline'] = $this->channel['description'];
              }
              foreach ( array_keys($this->items) as $i ) {
                  $item = $this->items[$i];
                  if ( isset($item['description']))
                      $item['summary'] = $item['description'];
                  if ( isset($item['content']['encoded'] ) )
                      $item['atom_content'] = $item['content']['encoded'];

                  if ( $this->is_rss() == '1.0' and isset($item['dc']['date']) ) {
                      $epoch = $this->_w3cdtf_to_epoch($item['dc']['date']);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }
                  elseif ( isset($item['pubdate']) ) {
                      $epoch = @strtotime($item['pubdate']);
                      if ($epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          }
      }

      /**
       * @return mixed the feed version if this is an RSS feed, false otherwise
       */
      function is_rss () {
          if ( $this->feed_type == RSS ) {
              return $this->feed_version;
          }
          else {
              return false;
          }
      }

      /**
       * @return mixed the feed version if this is an Atom feed, false otherwise
       */
      function is_atom() {
          if ( $this->feed_type == ATOM ) {
              return $this->feed_version;
          }
          else {
              return false;
          }
      }

      /**
       * Return XML parser, and possibly re-encoded source.
       *
       * @param string $source  feed source
       * @param string $out_enc requested output encoding (may be empty)
       * @param string $in_enc  declared input encoding (may be empty)
       * @param bool   $detect  whether to detect the input encoding
       * @return array array(parser handle or false, source string)
       */
      function create_parser($source, $out_enc, $in_enc, $detect) {
          // Compare real version numbers: looking only at the first character
          // of phpversion() sends PHP 7 and later down the PHP 4 code path.
          if ( version_compare(PHP_VERSION, '5.0.0', '>=') ) {
              $parser = $this->php5_create_parser($in_enc, $detect);
          }
          else {
              list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
          }
          if ($out_enc) {
              $this->encoding = $out_enc;
              // Unsupported target encodings are skipped silently, as before;
              // on PHP 8 passing one would throw instead of warn.
              if ( $parser and $this->known_encoding($out_enc) ) {
                  xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
              }
          }
          return array($parser, $source);
      }

      /**
       * Instantiate an XML parser under PHP 5 or later.
       *
       * When passed an empty string as the encoding the XML extension
       * detects the input encoding itself. An explicit input encoding is
       * only honoured when detection is switched off and the encoding is
       * one of $_KNOWN_ENCODINGS; otherwise a notice is recorded and
       * auto-detection is used.
       *
       * @param string $in_enc declared input encoding (may be empty)
       * @param bool   $detect whether to detect the input encoding
       * @return mixed parser handle, or false on failure
       */
      function php5_create_parser($in_enc, $detect) {
          if (!$detect && $in_enc) {
              if ( $this->known_encoding($in_enc) ) {
                  return xml_parser_create($in_enc);
              }
              $this->error("Unsupported input encoding ($in_enc), " .
                           "falling back to auto-detection.",
                           E_USER_NOTICE);
          }
          return xml_parser_create('');
      }

      /**
       * Instantiate an XML parser under PHP 4.
       *
       * PHP 4 cannot detect the source encoding, so the XML prolog is
       * inspected by hand. Sources in an encoding outside $_KNOWN_ENCODINGS
       * are converted to UTF-8 with iconv or mbstring when available.
       *
       * The following code is based on SJM's work with FoF
       * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
       *
       * @param string $source feed source
       * @param string $in_enc declared input encoding (may be empty)
       * @param bool   $detect whether to detect the input encoding
       * @return array array(parser handle, possibly re-encoded source)
       */
      function php4_create_parser($source, $in_enc, $detect) {
          if ( !$detect ) {
              return array(xml_parser_create($in_enc), $source);
          }

          if (!$in_enc) {
              // Read the encoding pseudo-attribute of the XML declaration.
              if (preg_match('/<\?xml[^>]*?\bencoding\s*=\s*[\'"]([^\'"]*)[\'"][^>]*\?>/i', $source, $m)) {
                  $in_enc = strtoupper($m[1]);
                  $this->_source_encoding = $in_enc;
                  $this->source_encoding = $in_enc; // legacy alias
              }
              else {
                  $in_enc = 'UTF-8';
              }
          }

          if ($this->known_encoding($in_enc)) {
              return array(xml_parser_create($in_enc), $source);
          }

          // the detected encoding is not one of the simple encodings PHP knows
          // attempt to use the iconv extension to
          // cast the XML to a known encoding
          // @see http://php.net/iconv
          if (function_exists('iconv')) {
              $encoded_source = iconv($in_enc,'UTF-8', $source);
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          // iconv didn't work, try mb_convert_encoding
          // @see http://php.net/mbstring
          if(function_exists('mb_convert_encoding')) {
              $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc );
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          // else
          $this->error("Feed is in an unsupported character encoding. ($in_enc) " .
                       "You may see strange artifacts, and mangled characters.",
                       E_USER_NOTICE);
          return array(xml_parser_create(), $source);
      }

      /**
       * Check an encoding name against $_KNOWN_ENCODINGS (case-insensitive).
       *
       * @param string $enc encoding name
       * @return mixed the upper-cased name when known, false otherwise
       */
      function known_encoding($enc) {
          $enc = strtoupper((string) $enc);
          if ( in_array($enc, $this->_KNOWN_ENCODINGS, true) ) {
              return $enc;
          }
          else {
              return false;
          }
      }

      /**
       * Report a problem and remember it on the object.
       *
       * The message is raised with trigger_error() when the MAGPIE_DEBUG
       * constant is defined and truthy, and written with error_log()
       * otherwise. Notice-level messages are stored in $WARNING, everything
       * else in $ERROR.
       *
       * @param string $errormsg message text
       * @param int    $lvl      one of the E_USER_* levels
       */
      function error ($errormsg, $lvl=E_USER_WARNING) {
          // An undefined MAGPIE_DEBUG means "debugging off" rather than an error.
          if ( defined('MAGPIE_DEBUG') and MAGPIE_DEBUG ) {
              trigger_error( $errormsg, $lvl);
          }
          else {
              error_log( $errormsg, 0);
          }

          $notices = E_USER_NOTICE|E_NOTICE;
          if ( $lvl&$notices ) {
              $this->WARNING = $errormsg;
          } else {
              $this->ERROR = $errormsg;
          }
      }

      /**
       * Render an attribute map as name="value" pairs separated by spaces.
       *
       * Values come from the feed and have already been entity-decoded by
       * the XML parser, so the markup-significant characters are re-escaped
       * to keep the rebuilt tag well-formed.
       *
       * @param array $attrs attribute name => value map
       * @return string
       */
      function _flatten_attrs($attrs) {
          $pairs = array();
          foreach ($attrs as $name => $value) {
              $value = str_replace(
                  array('&', '<', '>', '"'),
                  array('&amp;', '&lt;', '&gt;', '&quot;'),
                  $value );
              $pairs[] = $name . '="' . $value . '"';
          }
          return join(' ', $pairs);
      }

      /**
       * Convert a W3CDTF date string to a Unix timestamp.
       *
       * Uses parse_w3cdtf() (rss_utils.inc) when that function is loaded.
       * Otherwise strtotime() is used, but only for strings that carry at
       * least a full YYYY-MM-DD date.
       *
       * @param string $date_str W3CDTF date
       * @return int timestamp, or -1 when the string cannot be converted
       */
      function _w3cdtf_to_epoch($date_str) {
          if ( function_exists('parse_w3cdtf') ) {
              return parse_w3cdtf($date_str);
          }
          if ( preg_match('/^\s*\d{4}-\d{2}-\d{2}/', $date_str) ) {
              $epoch = @strtotime($date_str);
              if ( $epoch !== false ) {
                  return $epoch;
              }
          }
          return -1;
      }
  } // end class RSS
  521.  
  /**
   * Format one attribute as name="value" for re-serialising an inlined tag.
   *
   * The value is expected to be entity-decoded text taken from a feed, so
   * the markup-significant characters (& < > ") are escaped again;
   * otherwise a quote inside the value would terminate the attribute and
   * let the remainder be read as extra markup.
   *
   * @param string $k attribute name
   * @param string $v attribute value
   * @return string
   */
  function map_attrs($k, $v) {
      // '&' is replaced first so the entities produced by the later
      // replacements are not escaped a second time.
      $escaped = str_replace(
          array('&', '<', '>', '"'),
          array('&amp;', '&lt;', '&gt;', '&quot;'),
          $v );
      return "$k=\"$escaped\"";
  }
  525.  
  526.  
  527. ?>

Documentation generated on Mon, 21 Nov 2005 18:21:41 +0100 by phpDocumentor 1.3.0RC3