Source for file Snoopy.class.inc
Documentation is available at Snoopy.class.inc
/*************************************************
Snoopy - the PHP net client
Author: Monte Ohrt <monte@ispi.net>
Copyright (c): 1999-2000 ispi, all rights reserved
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
You may contact the author of Snoopy by e-mail at: monte@ispi.net
The latest version of Snoopy can be obtained from:
http://snoopy.sourceforge.com
*************************************************/
/**** Public variables ****/
/* user definable vars */
// ---- request settings -------------------------------------------------

// Host name we are connecting to.
var $host = "www.php.net";

// Port we are connecting to.
var $port = 80;

// Agent we masquerade as (sent in the User-Agent request header).
var $agent = "Snoopy v1.0";

// Referer info to pass (sent in the Referer request header).
var $referer = "";

// Cookies to pass, keyed by cookie name,
// e.g. $cookies["username"]="joe";
var $cookies = array();

// Raw headers to send, keyed by header name,
// e.g. $rawheaders["Content-type"]="text/html";
var $rawheaders = array();

// HTTP redirection depth maximum. 0 = disallow.
var $maxredirs = 5;

// Allows redirection off-site.
var $offsiteok = true;

// Frame content depth maximum. 0 = disallow.
var $maxframes = 0;

// Expand links to fully qualified URLs.
// This only applies to fetchlinks().
var $expandlinks = true;

// Pass set cookies back through redirects.
// NOTE: this currently does not respect dates, domains or paths.
var $passcookies = true;

// User for HTTP authentication.
var $user = "";

// Password for HTTP authentication.
var $pass = "";

// Value sent in the Accept request header.
var $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*";

// ---- values filled in by a fetch ----------------------------------------

// Where the content is put.
var $results = "";

// Error messages are sent here.
var $error = "";

// Headers returned from the server are sent here.
var $headers = array();

// Max return data length (body).
var $maxlength = 500000;

// NOTE(review): the two comment lines below read like the description of a
// read-timeout setting whose declaration is not visible here -- confirm.
// supported only since PHP 4 Beta 4
// set to 0 to disallow timeouts

// Set if a read operation timed out.
var $timed_out = false;

// HTTP request status.
var $status = 0;
// Snoopy will use cURL for fetching
// SSL content if a full system path to
// the cURL binary is supplied here.
// set to false if you do not have
// cURL installed. See http://curl.haxx.se
// for details on installing cURL.
// Snoopy does *not* use the cURL
// library functions built into php,
// as these functions are not stable
// as of this Snoopy release.
// send Accept-encoding: gzip?
/**** Private variables ****/
// Default submit type (content type used for submitted form data).
var $_submit_type = "application/x-www-form-urlencoded";

// MIME boundary for the multipart/form-data submit type.
var $_mime_boundary = "";

// Will be set if the page fetched is a redirect.
var $_redirectaddr = false;

// Set if using a proxy server.
var $_isproxy = false;
/*======================================================================*\
Purpose: fetch the contents of a web page
(and possibly other protocols in the
future like ftp, nntp, gopher, etc.)
Input: $URI the location of the page to fetch
Output: $this->results the output text from the fetch
\*======================================================================*/
//preg_match("|^([^:]+)://([^:/]+)(:[\d]+)*(.*)|",$URI,$URI_PARTS);
if (!empty($URI_PARTS["user"]))
$this->user =
$URI_PARTS["user"];
if (!empty($URI_PARTS["pass"]))
$this->pass =
$URI_PARTS["pass"];
switch($URI_PARTS["scheme"])
$this->host =
$URI_PARTS["host"];
if(!empty($URI_PARTS["port"]))
$this->port =
$URI_PARTS["port"];
// using proxy, send entire URI
$path =
$URI_PARTS["path"].
(isset
($URI_PARTS["query"]) ?
"?".
$URI_PARTS["query"] :
"");
// no proxy, send only the path
/* url was redirected, check if we've hit the max depth */
// only follow redirect if it's on this site, or offsiteok is true
/* follow the redirect */
while(list
(,$frameurl) =
each($frameurls))
$this->host =
$URI_PARTS["host"];
if(!empty($URI_PARTS["port"]))
$this->port =
$URI_PARTS["port"];
// using proxy, send entire URI
// Build the request path: the URI path plus "?query" when the parsed URI
// carries a query component. isset() avoids reading a missing "query" key
// and matches the other $path construction earlier in this file.
$path =
$URI_PARTS["path"].
(isset
($URI_PARTS["query"]) ?
"?".
$URI_PARTS["query"] :
"");
// no proxy, send only the path
/* url was redirected, check if we've hit the max depth */
// only follow redirect if it's on this site, or offsiteok is true
/* follow the redirect */
while(list
(,$frameurl) =
each($frameurls))
// Record the unsupported URI scheme in the error message. The trailing
// newline must live in a double-quoted string: inside single quotes "\n"
// is emitted as a literal backslash followed by "n".
$this->error =
'Invalid protocol "'.
$URI_PARTS["scheme"].
"\"\n";
/*======================================================================*\
\*======================================================================*/
/*======================================================================*\
Purpose: strip the hyperlinks from an html document
Input: $document document to strip.
Output: $match an array of the links
\*======================================================================*/
([\"\'])? # find single or double quote
(?(1) (.*?)\\1 | ([^\s\>]+)) # if quote found, match up to next matching
# quote, otherwise match up to next space
// catenate the non-empty matches from the conditional subpattern
while(list
($key,$val) =
each($links[2]))
while(list
($key,$val) =
each($links[3]))
/*======================================================================*\
Purpose: strip the form elements from an html document
Input: $document document to strip.
Output: $match the matched form element tags, joined with "\r\n"
\*======================================================================*/
preg_match_all("'<\/?(FORM|INPUT|SELECT|TEXTAREA|(OPTION))[^<>]*>(?(2)(.*(?=<\/?(option|select)[^<>]*>[\r\n]*)|(?=[\r\n]*))|(?=[\r\n]*))'Usi",$document,$elements);
$match =
implode("\r\n",$elements[0]);
/*======================================================================*\
Purpose: strip the text from an html document
Input: $document document to strip.
Output: $text the resulting text
\*======================================================================*/
// I didn't use preg eval (//e) since that is only available in PHP 4.0,
// so list your entities one by one here. I included some of the more common ones.
// Regular expressions used to reduce an HTML document to text
// (only the first entries of the list are visible here).
$search =
array("'<script[^>]*?>.*?</script>'si", // strip out javascript
"'<[\/\!]*?[^<>]*?>'si", // strip out html tags
"'([\r\n])[\s]+'", // strip out white space
"'&(quot|#34);'i", // replace html entities (&quot; is the named entity for the double quote)
/*======================================================================*\
Purpose: expand each link into a fully qualified URL
Input: $links the links to qualify
$URI the full URI to get the base from
Output: $expandedLinks the expanded links
\*======================================================================*/
$match =
preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$match[0]);
"|^(?!http://)(\/)?(?!mailto:)|i",
/*======================================================================*\
Purpose: go get the http data from the server
Input: $url the url to fetch
$fp the current open file pointer
$body body contents to send if any (POST)
\*======================================================================*/
function _httprequest($url,$fp,$URI,$http_method,$content_type=
"",$body=
"")
$headers =
$http_method.
" ".
$url.
" ".
$this->_httpversion.
"\r\n";
$headers .=
"User-Agent: ".
$this->agent.
"\r\n";
$headers .=
"Host: ".
$this->host.
"\r\n";
$headers .=
"Accept: ".
$this->accept.
"\r\n";
// make sure PHP was built with --with-zlib
// and we can handle gzipp'ed data
$headers .=
"Accept-encoding: gzip\r\n";
"use_gzip is on, but PHP was built without zlib support.".
" Requesting file(s) without gzip encoding.",
$headers .=
"Referer: ".
$this->referer.
"\r\n";
// Start the Cookie header line. Plain assignment (as the cURL request code
// does for $cookie_str) so the line never depends on an earlier value of
// $cookie_headers and no undefined-variable notice is raised.
$cookie_headers =
'Cookie: ';
foreach ( $this->cookies as $cookieKey =>
$cookieVal ) {
$cookie_headers .=
$cookieKey.
"=".
urlencode($cookieVal).
"; ";
$headers .=
substr($cookie_headers,0,-
2) .
"\r\n";
$headers .=
$headerKey.
": ".
$headerVal.
"\r\n";
if(!empty($content_type)) {
$headers .=
"Content-type: $content_type";
if ($content_type ==
"multipart/form-data")
$headers .=
"Content-length: ".
strlen($body).
"\r\n";
if(!empty($this->user) ||
!empty($this->pass))
// set the read timeout if needed
// content was returned gzip encoded?
// if($currentHeader == "\r\n")
// if a header begins with Location: or URI:, set the redirect
if(preg_match("/^(Location:|URI:)/i",$currentHeader))
// get URL portion of the redirect
// Capture the URL in $matches[2]. Case-insensitive, like the Location:/URI:
// test just before it, and tolerant of a missing space after the colon.
preg_match("/^(Location:|URI:)\s*(.*)/i",chop($currentHeader),$matches);
// look for :// in the Location header to see if hostname is included
// no host in the path, so prepend
// eliminate double slash
if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$currentHeader, $status))
if (preg_match("/Content-Encoding: gzip/", $currentHeader) ) {
# $results = fread($fp, $this->maxlength);
// per http://www.php.net/manual/en/function.gzencode.php
$results =
substr($results, 10);
// check if there is a redirect meta tag
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
// have we hit our frame depth and is there frame src to fetch?
for($x=
0; $x<
count($match[1]); $x++
)
// have we already fetched framed content?
/*======================================================================*\
Purpose: go get the https data from the server using curl
Input: $url the url to fetch
$body body contents to send if any (POST)
\*======================================================================*/
function _httpsrequest($url,$URI,$http_method,$content_type=
"",$body=
"")
// GET ... header not needed for curl
//$headers[] = $http_method." ".$url." ".$this->_httpversion;
$headers[] =
"User-Agent: ".
$this->agent;
$headers[] =
"Host: ".
$this->host;
$headers[] =
"Accept: ".
$this->accept;
$headers[] =
"Referer: ".
$this->referer;
$cookie_str =
'Cookie: ';
foreach ( $this->cookies as $cookieKey =>
$cookieVal ) {
$cookie_str .=
$cookieKey.
"=".
urlencode($cookieVal).
"; ";
$headers[] =
substr($cookie_str,0,-
2);
$headers[] =
$headerKey.
": ".
$headerVal;
if(!empty($content_type)) {
if ($content_type ==
"multipart/form-data")
$headers[] =
"Content-type: $content_type; boundary=".
$this->_mime_boundary;
$headers[] =
"Content-type: $content_type";
$headers[] =
"Content-length: ".
strlen($body);
if(!empty($this->user) ||
!empty($this->pass))
for($curr_header =
0; $curr_header <
count($headers); $curr_header++
) {
$cmdline_params .=
" -H \"".
$headers[$curr_header].
"\"";
$cmdline_params .=
" -d \"$body\"";
# accept self-signed certs
$cmdline_params .=
" -k";
$this->error =
"Error: cURL could not retrieve the document, error $return.";
$results =
implode("\r\n",$results);
$result_headers =
file("/tmp/$headerfile");
for($currentHeader =
0; $currentHeader <
count($result_headers); $currentHeader++
)
// if a header begins with Location: or URI:, set the redirect
if(preg_match("/^(Location: |URI: )/i",$result_headers[$currentHeader]))
// get URL portion of the redirect
// Capture the URL in $matches[2]. Case-insensitive, like the Location:/URI:
// test just before it; whitespace after the colon is skipped for both
// header names so the captured URL has no leading space.
preg_match("/^(Location:|URI:)\s*(.*)/i",chop($result_headers[$currentHeader]),$matches);
// look for :// in the Location header to see if hostname is included
// no host in the path, so prepend
// eliminate double slash
if(preg_match("|^HTTP/|",$result_headers[$currentHeader]))
$this->headers[] =
$result_headers[$currentHeader];
// check if there is a redirect meta tag
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
// have we hit our frame depth and is there frame src to fetch?
for($x=
0; $x<
count($match[1]); $x++
)
// have we already fetched framed content?
/*======================================================================*\
Purpose: set cookies for a redirection
\*======================================================================*/
$this->cookies[$match[1]] =
$match[2];
/*======================================================================*\
Purpose: checks whether timeout has occurred
\*======================================================================*/
if ($fp_status["timed_out"]) {
/*======================================================================*\
Purpose: make a socket connection
\*======================================================================*/
// socket connection succeeded
// socket connection failed
$this->error=
"socket creation failed (-3)";
$this->error=
"dns lookup failure (-4)";
$this->error=
"connection refused or timed out (-5)";
$this->error=
"connection failed (".
$errno.
")";
/*======================================================================*\
Purpose: disconnect a socket connection
\*======================================================================*/
/*======================================================================*\
Function: _prepare_post_body
Purpose: Prepare post body according to encoding type
Input: $formvars - form variables
$formfiles - form upload files
\*======================================================================*/
if (count($formvars) ==
0 &&
count($formfiles) ==
0)
case "application/x-www-form-urlencoded":
while(list
($key,$val) =
each($formvars)) {
while (list
($cur_key, $cur_val) =
each($val)) {
case "multipart/form-data":
while(list
($key,$val) =
each($formvars)) {
while (list
($cur_key, $cur_val) =
each($val)) {
// Part header for one element of an array-valued form field: the field
// name is "<key>[]". The braces delimit the variable so the brackets are
// emitted literally; "\[" in a double-quoted string would keep the
// backslash in the output.
$postdata .=
"Content-Disposition: form-data; name=\"{$key}[]\"\r\n\r\n";
$postdata .=
"$cur_val\r\n";
$postdata .=
"Content-Disposition: form-data; name=\"$key\"\r\n\r\n";
while (list
($field_name, $file_names) =
each($formfiles)) {
while (list
(, $file_name) =
each($file_names)) {
// Open the upload file for reading. The "b" flag keeps the content
// byte-for-byte on platforms that translate line endings in text mode.
$fp =
fopen($file_name, "rb");
$postdata .=
"Content-Disposition: form-data; name=\"$field_name\"; filename=\"$base_name\"\r\n\r\n";
$postdata .=
"$file_content\r\n";
Documentation generated on Tue, 01 May 2007 16:47:18 +0200 by phpDocumentor 1.3.2