Source for file Snoopy.class.inc

Documentation is available at Snoopy.class.inc

  1. <?php
  2.  
  3. /***********************************************
  4. Snoopy - the PHP net client
  5. Author: Monte Ohrt <monte@ispi.net>
  6. Copyright (c): 1999-2000 ispi, all rights reserved
  7. Version: 1.0
  8. * This library is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * This library is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with this library; if not, write to the Free Software
  20. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  21. You may contact the author of Snoopy by e-mail at:
  22. monte@ispi.net
  23. Or, write to:
  24. Monte Ohrt
  25. CTO, ispi
  26. 237 S. 70th suite 220
  27. Lincoln, NE 68510
  28. The latest version of Snoopy can be obtained from:
  29. http://snoopy.sourceforge.com
  30. *************************************************/
  31.  
  32.  
  33. class Snoopy
  34. {
  35. /**** Public variables ****/
  36.  
  37. /* user definable vars */
  38.  
  39. var $host = "www.php.net"; // host name we are connecting to
  40. var $port = 80; // port we are connecting to
  41. var $proxy_host = ""; // proxy host to use
  42. var $proxy_port = ""; // proxy port to use
  43. var $agent = "Snoopy v1.0"; // agent we masquerade as
  44. var $referer = ""; // referer info to pass
  45. var $cookies = array(); // array of cookies to pass
  46. // $cookies["username"]="joe";
  47. var $rawheaders = array(); // array of raw headers to send
  48. // $rawheaders["Content-type"]="text/html";
  49.  
  50.  
  51. var $maxredirs = 5; // http redirection depth maximum. 0 = disallow
  52. var $lastredirectaddr = ""; // contains address of last redirected address
  53. var $offsiteok = true; // allows redirection off-site
  54. var $maxframes = 0; // frame content depth maximum. 0 = disallow
  55. var $expandlinks = true; // expand links to fully qualified URLs.
  56. // this only applies to fetchlinks()
  57. // or submitlinks()
  58. var $passcookies = true; // pass set cookies back through redirects
  59. // NOTE: this currently does not respect
  60. // dates, domains or paths.
  61. var $user = ""; // user for http authentication
  62. var $pass = ""; // password for http authentication
  63. // http accept types
  64. var $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";
  65. var $results = ""; // where the content is put
  66. var $error = ""; // error messages sent here
  67. var $response_code = ""; // response code returned from server
  68. var $headers = array(); // headers returned from server sent here
  69. var $maxlength = 500000; // max return data length (body)
  70. var $read_timeout = 0; // timeout on read operations, in seconds
  71. // supported only since PHP 4 Beta 4
  72. // set to 0 to disallow timeouts
  73. var $timed_out = false; // if a read operation timed out
  74. var $status = 0; // http request status
  75. var $curl_path = "/usr/bin/curl";
  76. // Snoopy will use cURL for fetching
  77. // SSL content if a full system path to
  78. // the cURL binary is supplied here.
  79. // set to false if you do not have
  80. // cURL installed. See http://curl.haxx.se
  81. // for details on installing cURL.
  82. // Snoopy does *not* use the cURL
  83. // library functions built into php,
  84. // as these functions are not stable
  85. // as of this Snoopy release.
  86. // send Accept-encoding: gzip?
  87. var $use_gzip = true;
  88. /**** Private variables ****/
  89. var $_maxlinelen = 4096; // max line length (headers)
  90. var $_httpmethod = "GET"; // default http request method
  91. var $_httpversion = "HTTP/1.0"; // default http request version
  92. var $_submit_method = "POST"; // default submit method
  93. var $_submit_type = "application/x-www-form-urlencoded"; // default submit type
  94. var $_mime_boundary = ""; // MIME boundary for multipart/form-data submit type
  95. var $_redirectaddr = false; // will be set if page fetched is a redirect
  96. var $_redirectdepth = 0; // increments on an http redirect
  97. var $_frameurls = array(); // frame src urls
  98. var $_framedepth = 0; // increments on frame depth
  99. var $_isproxy = false; // set if using a proxy server
  100. var $_fp_timeout = 30; // timeout for socket connection
  101.  
  102. /*======================================================================*\
  103. Function: fetch
  104. Purpose: fetch the contents of a web page
  105. (and possibly other protocols in the
  106. future like ftp, nntp, gopher, etc.)
  107. Input: $URI the location of the page to fetch
  108. Output: $this->results the output text from the fetch
  109. \*======================================================================*/
  110.  
  /**
   * Fetch a document over http or https and store it in $this->results.
   *
   * After the request has been made, a pending redirect
   * ($this->_redirectaddr) is followed while $this->maxredirs has not been
   * reached, and any frame URLs collected in $this->_frameurls are fetched
   * while $this->_framedepth is below $this->maxframes. The return values
   * of those nested fetches are ignored, as they always have been.
   *
   * NOTE: $this->port is only overwritten when the URI names a port, so a
   * port left over from a previous call (or set by the caller) is reused.
   *
   * @param string $URI absolute http:// or https:// address to fetch
   * @return bool false when the protocol is unsupported, the cURL binary is
   *              unusable (https) or the connection fails (http); true otherwise
   */
  function fetch($URI)
  {
      // parse_url() returns false for seriously malformed input
      $URI_PARTS = parse_url($URI);
      if (!is_array($URI_PARTS))
          $URI_PARTS = array();

      if (!empty($URI_PARTS["user"]))
          $this->user = $URI_PARTS["user"];
      if (!empty($URI_PARTS["pass"]))
          $this->pass = $URI_PARTS["pass"];

      // schemes are case-insensitive, so compare a lower-cased copy
      $rawScheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
      $scheme = strtolower($rawScheme);
      if ($scheme != "http" && $scheme != "https")
      {
          // not a valid protocol
          $this->error = 'Invalid protocol "'.$rawScheme.'"'."\n";
          return false;
      }

      if ($scheme == "https" && (!$this->curl_path || !is_executable($this->curl_path)))
      {
          $this->error = "Bad curl ($this->curl_path), can't fetch HTTPS \n";
          return false;
      }

      $this->host = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
      if (!empty($URI_PARTS["port"]))
          $this->port = $URI_PARTS["port"];

      // request target used when no proxy is involved: path plus query.
      // A missing path becomes "/" so that "http://host?q" yields "/?q".
      $path = (isset($URI_PARTS["path"]) && $URI_PARTS["path"] != "") ? $URI_PARTS["path"] : "/";
      if (isset($URI_PARTS["query"]))
          $path .= "?".$URI_PARTS["query"];

      if ($scheme == "http")
      {
          if (!$this->_connect($fp))
              return false;

          // a proxy needs the entire URI, a direct connection only the path
          $this->_httprequest($this->_isproxy ? $URI : $path, $fp, $URI, $this->_httpmethod);
          $this->_disconnect($fp);
      }
      else
      {
          $this->_httpsrequest($this->_isproxy ? $URI : $path, $URI, $this->_httpmethod);
      }

      /* url was redirected, check if we've hit the max depth */
      if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
      {
          // only follow redirect if it's on this site, or offsiteok is true.
          // For an https fetch both http:// and https:// on the same host
          // count as on-site.
          $schemePattern = ($scheme == "https") ? "https?" : "http";
          $onSite = preg_match("|^".$schemePattern."://".preg_quote($this->host, "|")."|i", $this->_redirectaddr);
          if ($onSite || $this->offsiteok)
          {
              /* follow the redirect */
              $this->_redirectdepth++;
              $this->lastredirectaddr = $this->_redirectaddr;
              $this->fetch($this->_redirectaddr);
          }
      }

      if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
      {
          // work on a copy: the nested fetches refill $this->_frameurls
          $frameurls = $this->_frameurls;
          $this->_frameurls = array();

          foreach ($frameurls as $frameurl)
          {
              if ($this->_framedepth >= $this->maxframes)
                  break;
              $this->fetch($frameurl);
              $this->_framedepth++;
          }
      }

      return true;
  }
  240.  
  241.  
  242.  
  243. /*======================================================================*\
  244. Private functions
  245. \*======================================================================*/
  246. /*======================================================================*\
  247. Function: _striplinks
  248. Purpose: strip the hyperlinks from an html document
  249. Input: $document document to strip.
  250. Output: $match an array of the links
  251. \*======================================================================*/
  252.  
  /**
   * Collect the href targets of the anchor tags found in an HTML document.
   *
   * Targets that were quoted in the markup (capture group 2) are listed
   * first, followed by the unquoted ones (capture group 3).
   *
   * @param string $document HTML to scan
   * @return array list of href values; an empty array when nothing matched
   */
  function _striplinks($document)
  {
      preg_match_all("'<\s*a\s+.*href\s*=\s*			# find <a href=
                      ([\"\'])?					# find single or double quote
                      (?(1) (.*?)\\1 | ([^\s\>]+))		# if quote found, match up to next matching
                                                  # quote, otherwise match up to next space
                      'isx",$document,$links);

      // catenate the non-empty matches from the conditional subpattern;
      // start from an empty list so callers always receive an array
      $match = array();
      foreach (array(2, 3) as $group)
      {
          if (!isset($links[$group]) || !is_array($links[$group]))
              continue;
          foreach ($links[$group] as $val)
          {
              // compare against "" rather than using empty() so that the
              // valid relative link "0" is kept
              if ($val !== "")
                  $match[] = $val;
          }
      }

      // return the links
      return $match;
  }
  276.  
  277. /*======================================================================*\
  278. Function: _stripform
  279. Purpose: strip the form elements from an html document
  280. Input: $document document to strip.
  281. Output: $match the matched form elements, joined into one string with CRLF
  282. \*======================================================================*/
  283.  
  /**
   * Reduce an HTML document to its form markup.
   *
   * Every FORM, INPUT, SELECT, TEXTAREA and OPTION tag (opening or closing)
   * is picked up; for OPTION tags the text following the tag is included.
   *
   * @param string $document HTML to scan
   * @return string the matched fragments separated by CRLF
   */
  function _stripform($document)
  {
      $pattern = "'<\/?(FORM|INPUT|SELECT|TEXTAREA|(OPTION))[^<>]*>(?(2)(.*(?=<\/?(option|select)[^<>]*>[\r\n]*)|(?=[\r\n]*))|(?=[\r\n]*))'Usi";

      // index 0 of the result holds the full text of each match
      preg_match_all($pattern, $document, $found);

      // one fragment per line
      return implode("\r\n", $found[0]);
  }
  292.  
  293. /*======================================================================*\
  294. Function: _striptext
  295. Purpose: strip the text from an html document
  296. Input: $document document to strip.
  297. Output: $text the resulting text
  298. \*======================================================================*/
  299.  
  /**
   * Reduce an HTML document to its text content.
   *
   * Script blocks and tags are removed, whitespace following a line break
   * is collapsed, and a handful of common entities are decoded. The
   * patterns are applied in array order, so "&amp;" is decoded last: doing
   * it earlier would turn "&amp;lt;" into "<" instead of "&lt;".
   *
   * NOTE: iexcl, cent, pound and copy are emitted as single bytes via
   * chr(), exactly as before.
   *
   * @param string $document HTML to convert
   * @return string the resulting text
   */
  function _striptext($document)
  {
      // I didn't use preg eval (//e) since that is only available in PHP 4.0.
      // so, list your entities one by one here. I included some of the
      // more common ones.
      $search = array("'<script[^>]*?>.*?</script>'si",	// strip out javascript
                      "'<[\/\!]*?[^<>]*?>'si",		// strip out html tags
                      "'([\r\n])[\s]+'",			// strip out white space
                      "'&(quote?|#34);'i",		// replace html entities (the real
                                                  // entity is &quot;, the misspelt
                                                  // &quote; is still accepted)
                      "'&(lt|#60);'i",
                      "'&(gt|#62);'i",
                      "'&(nbsp|#160);'i",
                      "'&(iexcl|#161);'i",
                      "'&(cent|#162);'i",
                      "'&(pound|#163);'i",
                      "'&(copy|#169);'i",
                      "'&(amp|#38);'i"			// must stay last, see above
                      );
      $replace = array(	"",
                          "",
                          "\\1",
                          "\"",
                          "<",
                          ">",
                          " ",
                          chr(161),
                          chr(162),
                          chr(163),
                          chr(169),
                          "&");

      $text = preg_replace($search,$replace,$document);

      return $text;
  }
  333.  
  334. /*======================================================================*\
  335. Function: _expandlinks
  336. Purpose: expand each link into a fully qualified URL
  337. Input: $links the links to qualify
  338. $URI the full URI to get the base from
  339. Output: $expandedLinks the expanded links
  340. \*======================================================================*/
  341.  
  /**
   * Expand links into fully qualified URLs, using $URI as the base.
   *
   * The base is $URI without its query string. Its last path segment is
   * treated as a file name (and dropped) when it contains a dot; otherwise
   * the whole path is taken to be a directory. Each link is then handled
   * according to its form:
   *   - links that already carry a scheme (http:, https:, mailto:, ...)
   *     are left alone,
   *   - "//host/path" links receive the scheme of the base,
   *   - "/path" links are resolved against the site root of the base,
   *   - everything else is resolved against the base directory.
   * Finally "/./" and "/segment/../" sequences are collapsed (a single
   * pass, as before).
   *
   * @param mixed  $links a single link (string) or an array of links
   * @param string $URI   the full URI to take the base from
   * @return mixed the expanded link(s); a string for string input, an
   *               array with the original keys for array input
   */
  function _expandlinks($links,$URI)
  {
      // cut the query string off the base
      preg_match("/^[^\?]+/",$URI,$match);
      $base = isset($match[0]) ? $match[0] : "";

      if (preg_match('|^(([a-z][a-z0-9+.\-]*:)//[^/]+)(.*)$|i', $base, $parts))
      {
          $scheme = $parts[2];	// e.g. "http:"
          $root = $parts[1];	// e.g. "http://host:8080"
          // only the path may lose a trailing file name -- never the host
          $dir = $root.preg_replace('|/[^/]*\.[^/]*$|', "", $parts[3]);
      }
      else
      {
          // base is not an absolute URI: use it for both kinds of links
          $scheme = "";
          $root = preg_replace('|/[^/]*\.[^/]*$|', "", $base);
          $dir = $root;
      }
      // avoid "//" when the pieces are glued together below
      $root = preg_replace('|/$|', "", $root);
      $dir = preg_replace('|/$|', "", $dir);

      $single = !is_array($links);
      $list = $single ? array($links) : $links;

      $expandedLinks = array();
      foreach ($list as $key => $link)
      {
          $link = (string)$link;

          if (preg_match('|^[a-z][a-z0-9+.\-]*:|i', $link))
          {
              // already absolute (or mailto:, javascript:, ...): keep as is
          }
          elseif (substr($link, 0, 2) == "//")
          {
              // protocol-relative link
              $link = $scheme.$link;
          }
          elseif (substr($link, 0, 1) == "/")
          {
              // root-relative link
              $link = $root.$link;
          }
          else
          {
              // relative to the directory of the base document
              $link = $dir."/".$link;
          }

          $expandedLinks[$key] = preg_replace(array("|/\./|", "|/[^\/]+/\.\./|"), "/", $link);
      }

      return $single ? $expandedLinks[0] : $expandedLinks;
  }
  361.  
  362. /*======================================================================*\
  363. Function: _httprequest
  364. Purpose: go get the http data from the server
  365. Input: $url the url to fetch
  366. $fp the current open file pointer
  367. $URI the full URI
  368. $body body contents to send if any (POST)
  369. Output:
  370. \*======================================================================*/
  /**
   * Send one HTTP request over an already opened socket and read the reply.
   *
   * The request consists of the request line, the headers derived from the
   * object's settings (agent, host, accept, gzip, referer, cookies, raw
   * headers, content type/length, basic authentication) and the optional
   * body. While the response is read:
   *   - $this->headers receives every response header line,
   *   - $this->status / $this->response_code are taken from the status line,
   *   - $this->_redirectaddr is set from a Location:/URI: header or from a
   *     meta refresh tag in the body,
   *   - $this->_frameurls collects frame sources while the frame depth
   *     limit has not been reached,
   *   - $this->results receives the body (appended to an array once framed
   *     content is involved).
   *
   * @param string   $url          request target (path, or full URI for a proxy)
   * @param resource $fp           the current open file pointer
   * @param string   $URI          the full URI
   * @param string   $http_method  request method, e.g. "GET" or "POST"
   * @param string   $content_type value for the Content-type header, if any
   * @param string   $body         body contents to send if any (POST)
   * @return bool false when a read timed out (status is set to -100),
   *              true otherwise
   */
  function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
  {
      if ($this->passcookies && $this->_redirectaddr)
          $this->setcookies();

      $URI_PARTS = parse_url($URI);
      // scheme used to rebuild relative redirects and frame URLs
      $scheme = (is_array($URI_PARTS) && !empty($URI_PARTS["scheme"])) ? $URI_PARTS["scheme"] : "http";

      if (empty($url))
          $url = "/";

      $headers = $http_method." ".$url." ".$this->_httpversion."\r\n";
      if (!empty($this->agent))
          $headers .= "User-Agent: ".$this->agent."\r\n";
      if (!empty($this->host) && !isset($this->rawheaders['Host']))
          $headers .= "Host: ".$this->host."\r\n";
      if (!empty($this->accept))
          $headers .= "Accept: ".$this->accept."\r\n";

      if ($this->use_gzip)
      {
          // make sure PHP was built with --with-zlib
          // and we can handle gzipp'ed data
          if (function_exists('gzinflate'))
          {
              $headers .= "Accept-encoding: gzip\r\n";
          }
          else
          {
              trigger_error(
                  "use_gzip is on, but PHP was built without zlib support.".
                  "  Requesting file(s) without gzip encoding.",
                  E_USER_NOTICE);
          }
      }

      if (!empty($this->referer))
          $headers .= "Referer: ".$this->referer."\r\n";

      if (!empty($this->cookies))
      {
          if (!is_array($this->cookies))
              $this->cookies = (array)$this->cookies;

          $cookie_pairs = array();
          foreach ($this->cookies as $cookieKey => $cookieVal)
              $cookie_pairs[] = $cookieKey."=".urlencode($cookieVal);
          $headers .= "Cookie: ".implode("; ", $cookie_pairs)."\r\n";
      }

      if (!empty($this->rawheaders))
      {
          if (!is_array($this->rawheaders))
              $this->rawheaders = (array)$this->rawheaders;
          foreach ($this->rawheaders as $headerKey => $headerVal)
              $headers .= $headerKey.": ".$headerVal."\r\n";
      }

      if (!empty($content_type))
      {
          $headers .= "Content-type: $content_type";
          if ($content_type == "multipart/form-data")
              $headers .= "; boundary=".$this->_mime_boundary;
          $headers .= "\r\n";
      }
      if (!empty($body))
          $headers .= "Content-length: ".strlen($body)."\r\n";
      if (!empty($this->user) || !empty($this->pass))
          $headers .= "Authorization: BASIC ".base64_encode($this->user.":".$this->pass)."\r\n";

      $headers .= "\r\n";

      // set the read timeout if needed
      if ($this->read_timeout > 0)
          socket_set_timeout($fp, $this->read_timeout);
      $this->timed_out = false;

      $request = $headers.$body;
      fwrite($fp, $request, strlen($request));

      $this->_redirectaddr = false;
      // reset to an empty list so the property stays defined even when the
      // server sends no headers at all
      $this->headers = array();

      // content was returned gzip encoded?
      $is_gzipped = false;

      while (($currentHeader = fgets($fp, $this->_maxlinelen)) !== false)
      {
          if ($this->read_timeout > 0 && $this->_check_timeout($fp))
          {
              $this->status = -100;
              return false;
          }

          // an empty line ends the header section
          if (preg_match("/^\r?\n$/", $currentHeader))
              break;

          // if a header begins with Location: or URI:, set the redirect.
          // Header names are case-insensitive, so detection and extraction
          // share one case-insensitive pattern.
          if (preg_match("/^(Location|URI):\s*(.*)/i", chop($currentHeader), $matches))
          {
              $target = $matches[2];

              // only a leading scheme marks an absolute address; "://"
              // further along may simply be part of a query string
              if (!preg_match('|^[a-z][a-z0-9+.\-]*://|i', $target))
              {
                  // no host in the path, so prepend
                  $this->_redirectaddr = $scheme."://".$this->host.":".$this->port;
                  // eliminate double slash
                  if (!preg_match("|^/|", $target))
                      $this->_redirectaddr .= "/".$target;
                  else
                      $this->_redirectaddr .= $target;
              }
              else
                  $this->_redirectaddr = $target;
          }

          if (preg_match("|^HTTP/|", $currentHeader))
          {
              if (preg_match("|^HTTP/[^\s]*\s(.*?)\s|", $currentHeader, $status))
              {
                  $this->status = $status[1];
              }
              $this->response_code = $currentHeader;
          }

          if (preg_match("/^Content-Encoding:\s*gzip/i", $currentHeader))
          {
              $is_gzipped = true;
          }

          $this->headers[] = $currentHeader;
      }

      // read the body; stop once more than maxlength bytes have arrived.
      // Chunks are compared against false/"" explicitly so that a chunk
      // consisting of "0" does not end the loop early.
      $results = "";
      while (!feof($fp))
      {
          $data = fread($fp, $this->maxlength);
          if ($data === false || $data === "")
              break;
          $results .= $data;
          if (strlen($results) > $this->maxlength)
              break;
      }

      // gunzip
      if ($is_gzipped)
      {
          $inflated = false;
          if (function_exists('gzdecode'))
          {
              // parses the gzip header itself, including optional fields
              $inflated = gzdecode($results);
          }
          elseif (function_exists('gzinflate'))
          {
              // older PHP: skip the minimal 10 byte gzip header
              // per http://www.php.net/manual/en/function.gzencode.php
              $inflated = gzinflate(substr($results, 10));
          }

          if ($inflated === false)
          {
              $this->error = "Unable to decompress gzip encoded response body.";
              $inflated = "";
          }
          $results = $inflated;
      }

      if ($this->read_timeout > 0 && $this->_check_timeout($fp))
      {
          $this->status = -100;
          return false;
      }

      // check if there is a a redirect meta tag
      if (preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
      {
          $this->_redirectaddr = $this->_expandlinks($match[1],$URI);
      }

      // have we hit our frame depth and is there frame src to fetch?
      if (($this->_framedepth < $this->maxframes) && preg_match_all("'<frame\s+.*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
      {
          // [] cannot be applied to a string, so switch to an array first
          if (!is_array($this->results))
              $this->results = array();
          $this->results[] = $results;

          foreach ($match[1] as $frameSrc)
              $this->_frameurls[] = $this->_expandlinks($frameSrc, $scheme."://".$this->host);
      }
      // have we already fetched framed content?
      elseif (is_array($this->results))
          $this->results[] = $results;
      // no framed content
      else
          $this->results = $results;

      return true;
  }
  525.  
  526. /*======================================================================*\
  527. Function: _httpsrequest
  528. Purpose: go get the https data from the server using curl
  529. Input: $url the url to fetch
  530. $URI the full URI
  531. $body body contents to send if any (POST)
  532. Output:
  533. \*======================================================================*/
  534. function _httpsrequest($url,$URI,$http_method,$content_type="",$body="")
  535. {
  536. if($this->passcookies && $this->_redirectaddr)
  537. $this->setcookies();
  538.  
  539. $headers = array();
  540. $URI_PARTS = parse_url($URI);
  541. if(empty($url))
  542. $url = "/";
  543. // GET ... header not needed for curl
  544. //$headers[] = $http_method." ".$url." ".$this->_httpversion;
  545. if(!empty($this->agent))
  546. $headers[] = "User-Agent: ".$this->agent;
  547. if(!empty($this->host))
  548. $headers[] = "Host: ".$this->host;
  549. if(!empty($this->accept))
  550. $headers[] = "Accept: ".$this->accept;
  551. if(!empty($this->referer))
  552. $headers[] = "Referer: ".$this->referer;
  553. if(!empty($this->cookies))
  554. {
  555. if(!is_array($this->cookies))
  556. $this->cookies = (array)$this->cookies;
  557. reset($this->cookies);
  558. if ( count($this->cookies) > 0 ) {
  559. $cookie_str = 'Cookie: ';
  560. foreach ( $this->cookies as $cookieKey => $cookieVal ) {
  561. $cookie_str .= $cookieKey."=".urlencode($cookieVal)."; ";
  562. }
  563. $headers[] = substr($cookie_str,0,-2);
  564. }
  565. }
  566. if(!empty($this->rawheaders))
  567. {
  568. if(!is_array($this->rawheaders))
  569. $this->rawheaders = (array)$this->rawheaders;
  570. while(list($headerKey,$headerVal) = each($this->rawheaders))
  571. $headers[] = $headerKey.": ".$headerVal;
  572. }
  573. if(!empty($content_type)) {
  574. if ($content_type == "multipart/form-data")
  575. $headers[] = "Content-type: $content_type; boundary=".$this->_mime_boundary;
  576. else
  577. $headers[] = "Content-type: $content_type";
  578. }
  579. if(!empty($body))
  580. $headers[] = "Content-length: ".strlen($body);
  581. if(!empty($this->user) || !empty($this->pass))
  582. $headers[] = "Authorization: BASIC ".base64_encode($this->user.":".$this->pass);
  583. for($curr_header = 0; $curr_header < count($headers); $curr_header++)
  584. $cmdline_params .= " -H \"".$headers[$curr_header]."\"";
  585. if(!empty($body))
  586. $cmdline_params .= " -d \"$body\"";
  587. if($this->read_timeout > 0)
  588. $cmdline_params .= " -m ".$this->read_timeout;
  589. $headerfile = uniqid(time());
  590. # accept self-signed certs
  591. $cmdline_params .= " -k";
  592. exec($this->curl_path." -D \"/tmp/$headerfile\"".$cmdline_params." ".$URI,$results,$return);
  593. if($return)
  594. {
  595. $this->error = "Error: cURL could not retrieve the document, error $return.";
  596. return false;
  597. }
  598. $results = implode("\r\n",$results);
  599. $result_headers = file("/tmp/$headerfile");
  600. $this->_redirectaddr = false;
  601. unset($this->headers);
  602. for($currentHeader = 0; $currentHeader < count($result_headers); $currentHeader++)
  603. {
  604. // if a header begins with Location: or URI:, set the redirect
  605. if(preg_match("/^(Location: |URI: )/i",$result_headers[$currentHeader