Source for file rss_fetch.inc

Documentation is available at rss_fetch.inc

  1. <?php
  2. /*
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_fetch.inc, a simple functional interface
  5. to fetching and parsing RSS files, via the
  6. function fetch_rss()
  7. * Author: Kellan Elliott-McCrea <kellan@protest.net>
  8. * License: GPL
  9. *
  10. * The latest version of MagpieRSS can be obtained from:
  11. * http://magpierss.sourceforge.net
  12. *
  13. * For questions, help, comments, discussion, etc., please join the
  14. * Magpie mailing list:
  15. * magpierss-general@lists.sourceforge.net
  16. *
  17. */
  18.  
  19. /* Not needed on MidCOM
  20. // Setup MAGPIE_DIR for use on hosts that don't include
  21. // the current path in include_path.
  22. // with thanks to rajiv and smarty
  23. if (!defined('DIR_SEP')) {
  24. define('DIR_SEP', DIRECTORY_SEPARATOR);
  25. }
  26.  
  27. if (!defined('MAGPIE_DIR')) {
  28. define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
  29. }
  30.  
  31. require_once( MAGPIE_DIR . 'rss_parse.inc' );
  32. require_once( MAGPIE_DIR . 'rss_cache.inc' );
  33.  
  34. // for including 3rd party libraries
  35. define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
  36. require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
  37. */
  38.  
  39. /*
  40. * CONSTANTS - redefine these in your script to change the
  41. * behaviour of fetch_rss(). Currently, most options affect the cache.
  42. *
  43. * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
  44. * For me a built in cache was essential to creating a "PHP-like"
  45. * feel to Magpie, see rss_cache.inc for rationale
  46. *
  47. *
  48. * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
  49. * This should be a location that the webserver can write to. If this
  50. * directory does not already exist Magpie will try to be smart and create
  51. * it. This will often fail for permissions reasons.
  52. *
  53. *
  54. * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
  55. *
  56. *
  57. * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
  58. * instead of returning stale object?
  59. *
  60. * MAGPIE_DEBUG - Display debugging notices?
  61. *
  62. */
  63.  
  64.  
  65. /*=======================================================================*\
  66. Function: fetch_rss:
  67. Purpose: return RSS object for the given url
  68. maintain the cache
  69. Input: url of RSS file
  70. Output: parsed RSS object (see rss_parse.inc)
  71.  
  72. NOTES ON CACHING:
  73. If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache.
  74. NOTES ON RETRIEVING REMOTE FILES:
  75. If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will
  76. return a cached object, and touch the cache object upon receiving a
  77. 304.
  78. NOTES ON FAILED REQUESTS:
  79. If there is an HTTP error while fetching an RSS object, the cached
  80. version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off)
  81. \*=======================================================================*/
  82.  
  // Release identifier of this MagpieRSS copy; init() embeds it in the
  // default User-Agent string.
  define('MAGPIE_VERSION', '0.7');

  // Holds the most recent error message; written by error() and
  // magpie_error(), read back through magpie_error().
  $MAGPIE_ERROR = '';
  86.  
  /**
   * Fetch and parse the RSS feed at $url, going through the on-disk cache
   * when MAGPIE_CACHE_ON is true.
   *
   * With the cache on the flow is:
   *   1. look the feed up in the cache (keyed by URL + output encoding);
   *   2. on a fresh HIT return the cached object (flagged from_cache = 1);
   *   3. on a STALE entry issue a conditional GET using the cached validators;
   *   4. on 304 re-store and return the cached object, on 2xx parse, store
   *      and return the new object;
   *   5. on any failure return the stale object unless
   *      MAGPIE_CACHE_FRESH_ONLY is true, otherwise report the error.
   *
   * @param string $url  URL of the RSS file.
   * @return mixed       Parsed RSS object, or false on failure (the message
   *                     is then available through magpie_error()).
   */
  function fetch_rss ($url) {
      // Make sure every MAGPIE_* constant read below has a value.
      init();

      if ( !isset($url) ) {
          error("fetch_rss called without a url");
          return false;
      }

      // Cache disabled: a plain fetch-and-parse, nothing to fall back on.
      if ( !MAGPIE_CACHE_ON ) {
          $resp = _fetch_remote_file( $url );
          if ( is_success( $resp->status ) ) {
              return _response_to_rss( $resp );
          }

          error("Failed to fetch $url and cache is off");
          return false;
      }

      // Cache enabled from here on.
      $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

      if ( MAGPIE_DEBUG and $cache->ERROR ) {
          debug($cache->ERROR, E_USER_WARNING);
      }

      $cache_status    = 0;        // response of check_cache
      $request_headers = array();  // HTTP headers to send with fetch
      $rss             = 0;        // parsed RSS object
      $errormsg        = 0;        // errors, if any

      // Store parsed XML by desired output encoding, as character munging
      // happens at parse time.
      $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

      if ( !$cache->ERROR ) {
          // returns cache HIT, MISS, or STALE
          $cache_status = $cache->check_cache( $cache_key );
      }

      // Strict comparison on purpose: before PHP 8 the initial integer 0
      // loosely equals any non-numeric string, so `0 == 'HIT'` was true and
      // an unusable cache was treated as a hit.
      if ( $cache_status === 'HIT' ) {
          $rss = $cache->get( $cache_key );
          if ( $rss ) {
              $rss->from_cache = 1;
              if ( MAGPIE_DEBUG > 1 ) {
                  debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
              }
              return $rss;
          }
      }

      // Stale entry: keep it as a fallback and set up a conditional GET.
      if ( $cache_status === 'STALE' ) {
          // Must use the same key the object was stored under ($cache_key,
          // not the bare URL), otherwise the stale copy is never found.
          $rss = $cache->get( $cache_key );

          // get() may fail; only read validators from a real object. Each
          // validator is sent on its own when present.
          if ( $rss ) {
              if ( !empty($rss->etag) ) {
                  $request_headers['If-None-Match'] = $rss->etag;
              }
              if ( !empty($rss->last_modified) ) {
                  // The HTTP request header that carries a Last-Modified
                  // validator is If-Modified-Since.
                  $request_headers['If-Modified-Since'] = $rss->last_modified;
              }
          }
      }

      $resp = _fetch_remote_file( $url, $request_headers );

      if ( $resp ) {
          // A 304 is only usable when we actually hold a cached object.
          if ( $resp->status == '304' and $rss ) {
              // we have the most current copy
              if ( MAGPIE_DEBUG > 1 ) {
                  debug("Got 304 for $url");
              }
              // reset cache on 304 (at minutillo insistent prodding)
              $cache->set( $cache_key, $rss );
              return $rss;
          }
          elseif ( is_success( $resp->status ) ) {
              $rss = _response_to_rss( $resp );
              if ( $rss ) {
                  if ( MAGPIE_DEBUG > 1 ) {
                      debug("Fetch successful");
                  }
                  // add object to cache
                  $cache->set( $cache_key, $rss );
                  return $rss;
              }
              // Parse failure: _response_to_rss() has already reported it,
              // and $rss is now false so nothing stale is returned below.
          }
          else {
              $errormsg = "Failed to fetch $url. ";
              if ( $resp->status == '-100' ) {
                  $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
              }
              elseif ( $resp->error ) {
                  # compensate for Snoopy's annoying habit of tacking
                  # on '\n'
                  $http_error = substr($resp->error, 0, -2);
                  $errormsg .= "(HTTP Error: $http_error)";
              }
              else {
                  $errormsg .= "(HTTP Response: " . $resp->response_code .')';
              }
          }
      }
      else {
          $errormsg = "Unable to retrieve RSS file for unknown reasons.";
      }

      // Fetch failed: fall back to the stale object, unless the caller asked
      // for fresh data only (MAGPIE_CACHE_FRESH_ONLY).
      if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
          if ( MAGPIE_DEBUG ) {
              debug("Returning STALE object for $url");
          }
          return $rss;
      }

      // else we totally failed
      error( $errormsg );
      return false;
  } // end fetch_rss()
  /*=======================================================================*\
      Function:   error
      Purpose:    record a message in $MAGPIE_ERROR and raise it through
                  trigger_error()
      Input:      the message, and an optional error level
                  (defaults to E_USER_WARNING)
      Output:     none; an empty message is ignored entirely
  \*=======================================================================*/

  function error ($errormsg, $lvl=E_USER_WARNING) {
      global $MAGPIE_ERROR;

      // Append PHP's own error message if track_errors made one available.
      // NOTE(review): $php_errormsg is read from this function's local scope,
      // so this branch looks like it can never fire - confirm before relying
      // on it.
      if ( isset($php_errormsg) ) {
          $errormsg = $errormsg . " ($php_errormsg)";
      }

      // Nothing to report.
      if ( !$errormsg ) {
          return;
      }

      $MAGPIE_ERROR = "MagpieRSS: $errormsg";
      trigger_error( $MAGPIE_ERROR, $lvl );
  }
  219.  
  // Raise a debugging message through trigger_error(), tagged so it can be
  // told apart from real errors. $lvl defaults to E_USER_NOTICE.
  function debug ($debugmsg, $lvl=E_USER_NOTICE) {
      $tagged = 'MagpieRSS [debug] ' . $debugmsg;
      trigger_error($tagged, $lvl);
  }
  /*=======================================================================*\
      Function:   magpie_error
      Purpose:    accessor for the magpie error variable; a non-empty
                  argument overwrites the stored message first
      Output:     the current value of $MAGPIE_ERROR
  \*=======================================================================*/
  function magpie_error ($errormsg="") {
      global $MAGPIE_ERROR;

      // Empty, null or otherwise falsy arguments leave the stored value alone.
      if ( !empty($errormsg) ) {
          $MAGPIE_ERROR = $errormsg;
      }

      return $MAGPIE_ERROR;
  }
  234.  
  /*=======================================================================*\
      Function:   _fetch_remote_file
      Purpose:    retrieve an arbitrary remote file
      Input:      url of the remote file
                  headers to send along with the request (optional;
                  anything that is not an array is ignored)
      Output:     the Snoopy client used for the request, which doubles as
                  the HTTP response object (see Snoopy.class.inc)
  \*=======================================================================*/
  function _fetch_remote_file ($url, $headers = "" ) {
      // Snoopy is an HTTP client in PHP; configure it from the MAGPIE_*
      // constants before issuing the request.
      $snoopy = new Snoopy();
      $snoopy->agent        = MAGPIE_USER_AGENT;
      $snoopy->read_timeout = MAGPIE_FETCH_TIME_OUT;
      $snoopy->use_gzip     = MAGPIE_USE_GZIP;

      if ( is_array($headers) ) {
          $snoopy->rawheaders = $headers;
      }

      // Errors raised by fetch() are suppressed here; callers inspect the
      // status/error fields of the returned object instead.
      @$snoopy->fetch($url);

      return $snoopy;
  }
  255.  
  /*=======================================================================*\
      Function:   _response_to_rss
      Purpose:    parse an HTTP response object into an RSS object, and copy
                  the response's ETag / Last-Modified validators onto it
      Input:      an HTTP response object (see Snoopy); its ->results and
                  ->headers are read
      Output:     parsed RSS object (see rss_parse), or false after
                  reporting the parse failure through error()
  \*=======================================================================*/
  function _response_to_rss ($resp) {
      $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

      // Parse failed: build the error message and bail out.
      if ( !$rss or $rss->ERROR ) {
          $errormsg = "Failed to parse RSS file.";
          if ($rss) {
              $errormsg .= " (" . $rss->ERROR . ")";
          }
          error($errormsg);
          return false;
      }

      // Find ETag and Last-Modified. The cast keeps foreach quiet when the
      // response carries no header array at all.
      foreach ( (array) $resp->headers as $h ) {
          // 2003-03-02 - Nicola Asuni (www.tecnick.com) - fixed bug "Undefined offset: 1"
          if ( strpos($h, ": ") ) {
              list($field, $val) = explode(": ", $h, 2);
          }
          else {
              $field = $h;
              $val   = "";
          }

          // HTTP header names are case-insensitive, so "Etag:" and "etag:"
          // must match as well as "ETag:".
          $field = strtolower(trim($field));

          // Drop surrounding whitespace / line endings so the stored value
          // can be replayed verbatim in a later request header.
          // (presumably Snoopy keeps the raw header line - TODO confirm)
          $val = trim($val);

          if ( $field === 'etag' ) {
              $rss->etag = $val;
          }
          elseif ( $field === 'last-modified' ) {
              $rss->last_modified = $val;
          }
      }

      return $rss;
  }
  294.  
  /*=======================================================================*\
      Function:   init
      Purpose:    give every MAGPIE_* option a default value, leaving any
                  constant the calling script already defined untouched;
                  runs its body only once per request
  \*=======================================================================*/
  function init () {
      // One-shot guard: later calls return immediately.
      if ( defined('MAGPIE_INITALIZED') ) {
          return;
      }
      define('MAGPIE_INITALIZED', true);

      // Cache, encoding and debug defaults.
      $defaults = array(
          'MAGPIE_CACHE_ON'         => true,
          'MAGPIE_CACHE_DIR'        => './cache',
          'MAGPIE_CACHE_AGE'        => 60*60,          // one hour
          'MAGPIE_CACHE_FRESH_ONLY' => false,
          'MAGPIE_OUTPUT_ENCODING'  => 'ISO-8859-1',
          'MAGPIE_INPUT_ENCODING'   => null,
          'MAGPIE_DETECT_ENCODING'  => true,
          'MAGPIE_DEBUG'            => 0,
      );
      foreach ( $defaults as $name => $value ) {
          if ( !defined($name) ) {
              define($name, $value);
          }
      }

      // The default User-Agent advertises whether caching is in use, so it
      // has to be built after MAGPIE_CACHE_ON is settled.
      if ( !defined('MAGPIE_USER_AGENT') ) {
          $tail = MAGPIE_CACHE_ON ? ')' : '; No cache)';
          define('MAGPIE_USER_AGENT', 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $tail);
      }

      // Transport defaults.
      $transport = array(
          'MAGPIE_FETCH_TIME_OUT' => 5,      // 5 second timeout
          'MAGPIE_USE_GZIP'       => true,   // use gzip encoding to fetch rss files if supported?
      );
      foreach ( $transport as $name => $value ) {
          if ( !defined($name) ) {
              define($name, $value);
          }
      }
  }
  353.  
  354. // NOTE: the following code should really be in Snoopy, or at least
  355. // somewhere other than rss_fetch!
  356.  
  357. /*=======================================================================*\
  358. HTTP STATUS CODE PREDICATES
  359. These functions attempt to classify an HTTP status code
  360. based on RFC 2616 and RFC 2518.
  361. All of them take an HTTP status code as input, and return true or false
  362.  
  363. All this code is adapted from LWP's HTTP::Status.
  364. \*=======================================================================*/
  365.  
  366.  
  /*=======================================================================*\
      Function:   is_info
      Purpose:    return true if Informational status code (100-199)
  \*=======================================================================*/
  function is_info ($sc) {
      $at_or_above_floor = ( 100 <= $sc );
      $below_ceiling     = ( 200 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  374.  
  /*=======================================================================*\
      Function:   is_success
      Purpose:    return true if Successful status code (200-299)
  \*=======================================================================*/
  function is_success ($sc) {
      $at_or_above_floor = ( 200 <= $sc );
      $below_ceiling     = ( 300 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  382.  
  /*=======================================================================*\
      Function:   is_redirect
      Purpose:    return true if Redirection status code (300-399)
  \*=======================================================================*/
  function is_redirect ($sc) {
      $at_or_above_floor = ( 300 <= $sc );
      $below_ceiling     = ( 400 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  390.  
  /*=======================================================================*\
      Function:   is_error
      Purpose:    return true if Error status code, client or server
                  (400-599)
  \*=======================================================================*/
  function is_error ($sc) {
      $at_or_above_floor = ( 400 <= $sc );
      $below_ceiling     = ( 600 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  398.  
  /*=======================================================================*\
      Function:   is_client_error
      Purpose:    return true if Error status code, and it's a client error
                  (400-499)
  \*=======================================================================*/
  function is_client_error ($sc) {
      $at_or_above_floor = ( 400 <= $sc );
      $below_ceiling     = ( 500 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  406.  
  /*=======================================================================*\
      Function:   is_server_error
      Purpose:    return true if Error status code, and it's a server error
                  (500-599)
  \*=======================================================================*/
  function is_server_error ($sc) {
      $at_or_above_floor = ( 500 <= $sc );
      $below_ceiling     = ( 600 > $sc );

      return $at_or_above_floor && $below_ceiling;
  }
  414.  
  415. ?>

Documentation generated on Mon, 21 Nov 2005 18:21:37 +0100 by phpDocumentor 1.3.0RC3