Няма описание

SaeFetchurl.class.php 19KB

    <?php
    /**
     * SAE fetch-URL (remote data fetching) service.
     *
     * @author zhiyong
     * @version $Id: SaeFetchurl.class.php 2766 2012-02-20 15:58:21Z luofei614@gmail.com $
     * @package sae
     *
     */

    /**
     * SAE data-fetching class.
     *
     * SaeFetchurl fetches external resources. Supported protocols: http/https.<br />
     * This class is deprecated; use curl directly to fetch external resources.
     * @deprecated This class is deprecated; use curl directly to fetch external resources.
     *
     * Default timeouts (as stated by the original SAE documentation):
     *  - connect timeout: 5 seconds
     *  - send timeout: 30 seconds
     *  - receive timeout: 40 seconds
     *
     * Fetching a page
     * <code>
     * $f = new SaeFetchurl();
     * $content = $f->fetch('http://sina.cn');
     * </code>
     *
     * Issuing a POST request
     * <code>
     * $f = new SaeFetchurl();
     * $f->setMethod('post');
     * $f->setPostData( array('name'=> 'easychen' , 'email' => 'easychen@gmail.com' , 'file' => 'binary file content') );
     * $ret = $f->fetch('http://photo.sinaapp.com/save.php');
     *
     * // print the error code and message when the fetch fails
     * if ($ret === false)
     *         var_dump($f->errno(), $f->errmsg());
     * </code>
     *
     * Error code reference (from the original SAE documentation):
     *  - errno: 0      success
     *  - errno: 600    fetchurl internal service error
     *  - errno: 601    accesskey does not exist
     *  - errno: 602    authentication error, possibly a wrong secretkey
     *  - errno: 603    fetchurl quota exceeded
     *  - errno: 604    REST protocol error (a required header is missing or similar); use the fetch_url function provided by SAE
     *  - errno: 605    the requested URI is malformed
     *  - errno: 606    the server of the requested URI is unreachable
     *
     * @author zhiyong
     * @version $Id: SaeFetchurl.class.php 2766 2012-02-20 15:58:21Z luofei614@gmail.com $
     * @package sae
     *
     */
    class SaeFetchurl extends SaeObject
    {
        /**
         * Creates the fetcher and resets all per-request state.
         *
         * @param string|null $akey access key; SAE_ACCESSKEY is used when NULL
         * @param string|null $skey secret key; SAE_SECRETKEY is used when NULL
         */
        public function __construct( $akey = NULL , $skey = NULL )
        {
            if ( $akey === NULL ) {
                $akey = SAE_ACCESSKEY;
            }
            if ( $skey === NULL ) {
                $skey = SAE_SECRETKEY;
            }

            $this->impl_ = new FetchUrl($akey, $skey);
            $this->method_ = "get";
            $this->cookies_ = array();
            $this->opt_ = array();
            $this->headers_ = array();
        }

        /**
         * Sets the access key and the secret key.
         *
         * Not needed when the keys of the current application are used.
         *
         * @param string $akey
         * @param string $skey
         * @return void
         * @author zhiyong
         * @ignore
         */
        public function setAuth( $akey , $skey )
        {
            $this->impl_->setAccesskey($akey);
            $this->impl_->setSecretkey($skey);
        }

        /**
         * @ignore
         */
        public function setAccesskey( $akey )
        {
            $this->impl_->setAccesskey($akey);
        }

        /**
         * @ignore
         */
        public function setSecretkey( $skey )
        {
            $this->impl_->setSecretkey($skey);
        }

        /**
         * Sets the request method (POST/GET/PUT...).
         *
         * @param string $method
         * @return void
         * @author zhiyong
         */
        public function setMethod( $method )
        {
            $method = trim($method);
            $this->method_ = $method;
            $this->opt_['method'] = $method;
        }

        /**
         * Sets the request body for POST.
         *
         * @param array|string $post_data When an array, keys are field names and values are field values and the
         *                                data is submitted as multipart. When a string, it is sent as the raw
         *                                request body. Same format as $data in
         *                                curl_setopt($ch, CURLOPT_POSTFIELDS, $data).
         * @param bool $multipart whether the values are binary data
         * @return bool always true
         * @author zhiyong
         */
        public function setPostData( $post_data , $multipart = false )
        {
            $this->opt_["post"] = $post_data;
            $this->opt_["multipart"] = $multipart;

            return true;
        }

        /**
         * Adds a header to the outgoing request.
         *
         * Headers that cannot be set through this method (see FetchUrl::$disabledHeaders):
         *  - Content-Length
         *  - Host
         *  - Vary
         *  - Via
         *  - X-Forwarded-For
         *  - FetchUrl
         *  - AccessKey
         *  - TimeStamp
         *  - Signature
         *  - AllowTruncated    // use setAllowTrunc instead
         *  - ConnectTimeout    // use setConnectTimeout instead
         *  - SendTimeout       // use setSendTimeout instead
         *  - ReadTimeout       // use setReadTimeout instead
         *
         * @param string $name
         * @param string $value
         * @return bool true when the header was stored, false (plus an E_USER_NOTICE) when it is disabled
         * @author zhiyong
         */
        public function setHeader( $name , $value )
        {
            $name = trim($name);

            if (in_array(strtolower($name), FetchUrl::$disabledHeaders)) {
                trigger_error("Disabled FetchUrl Header:" . $name, E_USER_NOTICE);
                return false;
            }

            $this->headers_[$name] = $value;
            return true;
        }

        /**
         * Sets a FetchUrl option.
         *
         * Option list:
         *  - truncated   bool    whether truncation is allowed
         *  - redirect    bool    whether redirects are followed
         *  - username    string  HTTP auth user name
         *  - password    string  HTTP auth password
         *  - useragent   string  custom user agent
         *
         * @param string $name
         * @param string $value
         * @return void
         * @author Elmer Zhang
         * @ignore
         */
        public function setOpt( $name , $value )
        {
            $this->opt_[trim($name)] = $value;
        }

        /**
         * Adds several cookies to the outgoing request.
         *
         * @param array $cookies cookies to add, format: array('key1' => 'value1', 'key2' => 'value2', ....)
         * @return void
         * @author zhiyong
         */
        public function setCookies( $cookies = array() )
        {
            if ( !is_array($cookies) || empty($cookies) ) {
                return;
            }

            foreach ( $cookies as $name => $value ) {
                $this->setCookie($name, $value);
            }
        }

        /**
         * Adds one cookie to the outgoing request. May be called repeatedly to add several cookies.
         *
         * @param string $name
         * @param string $value
         * @return void
         * @author zhiyong
         */
        public function setCookie( $name , $value )
        {
            $name = trim($name);
            $this->cookies_[] = "$name=$value";
        }

        /**
         * Whether truncation is allowed.
         *
         * The value is stored as the "truncated" option, which FetchUrl forwards as an
         * "AllowTruncated" request header. Per the original documentation: when true, data
         * exceeding the allowed size is cut to fit; when false, false is returned instead.
         *
         * @param bool $allow
         * @return void
         * @author zhiyong
         */
        public function setAllowTrunc($allow)
        {
            $this->opt_["truncated"] = $allow;
        }

        /**
         * Sets the connect timeout.
         *
         * The value is forwarded as a "ConnectTimeout" request header. Per the original
         * documentation it must be below the SAE system limit (default 5 seconds).
         *
         * @param int $ms milliseconds
         * @return void
         * @author zhiyong
         */
        public function setConnectTimeout($ms)
        {
            $this->opt_["connecttimeout"] = $ms;
        }

        /**
         * Sets the send timeout.
         *
         * The value is forwarded as a "SendTimeout" request header. Per the original
         * documentation it must be below the SAE system limit (default 20 seconds).
         *
         * @param int $ms milliseconds
         * @return void
         * @author zhiyong
         */
        public function setSendTimeout($ms)
        {
            $this->opt_["sendtimeout"] = $ms;
        }

        /**
         * Sets the read timeout.
         *
         * The value is forwarded as a "ReadTimeout" request header. Per the original
         * documentation it must be below the SAE system limit (default 60 seconds).
         *
         * @param int $ms milliseconds
         * @return void
         * @author zhiyong
         */
        public function setReadTimeout($ms)
        {
            // Lower-case key, like every other setter, so a per-call 'readtimeout'
            // option overrides it instead of producing a second header.
            $this->opt_["readtimeout"] = $ms;
        }

        /**
         * Whether redirects are followed when the requested page redirects.
         * At most FetchUrl::maxRedirect_ requests are issued per fetch.
         *
         * @param bool $allow true: follow redirects, false: do not. Defaults to true.
         * @return void
         * @author zhiyong
         */
        public function setAllowRedirect($allow = true)
        {
            $this->opt_["redirect"] = $allow;
        }

        /**
         * Sets the HTTP authentication credentials.
         *
         * @param string $username HTTP auth user name
         * @param string $password HTTP auth password
         * @return void
         * @author zhiyong
         */
        public function setHttpAuth($username, $password)
        {
            $this->opt_["username"] = $username;
            $this->opt_["password"] = $password;
        }

        /**
         * Issues the request.
         *
         * <code>
         * echo "Use callback function\n";
         *
         * function demo($content) {
         *         echo strtoupper($content);
         * }
         *
         * $furl = new SaeFetchurl();
         * $furl->fetch($url, $opt, 'demo');
         *
         * echo "Use callback class\n";
         *
         * class Ctx {
         *         public function demo($content) {
         *                 $this->c .= $content;
         *         }
         *         public $c;
         * };
         *
         * $ctx = new Ctx;
         * $furl = new SaeFetchurl();
         * $furl->fetch($url, $opt, array($ctx, 'demo'));
         * echo $ctx->c;
         * </code>
         *
         * @param string $url
         * @param array $opt request options, format: array('key1'=>'value1', 'key2'=>'value2', ... ).
         *                   They are merged over the options set on this object. Option list:
         *  - truncated   bool    whether truncation is allowed
         *  - redirect    bool    whether redirects are followed
         *  - username    string  HTTP auth user name
         *  - password    string  HTTP auth password
         *  - useragent   string  custom user agent
         * @param callback $callback function that receives the response data; a function name or an
         *                           array(object, 'method') pair.
         * @return string the fetched content on success, false otherwise
         * @author zhiyong
         */
        public function fetch( $url, $opt = NULL, $callback=NULL )
        {
            if (count($this->cookies_) != 0) {
                $this->opt_["cookie"] = join("; ", $this->cookies_);
            }

            // Per-call options win over the ones stored on the object.
            $opt = ($opt) ? array_merge($this->opt_, $opt) : $this->opt_;

            return $this->impl_->fetch($url, $opt, $this->headers_, $callback);
        }

        /**
         * Returns the response headers.
         *
         * @param bool $parse whether to parse the headers, defaults to true. When false the raw
         *                    header lines are returned.
         * @return array when parsed: header name => value; "Set-Cookie" maps to a list of values
         * @author zhiyong
         */
        public function responseHeaders($parse = true)
        {
            $items = explode("\r\n", (string) $this->impl_->headerContent());

            if (!$parse) {
                return $items;
            }

            // The first line is the status line, not a header.
            array_shift($items);

            $headers = array();
            foreach ($items as $line) {
                $pos = strpos($line, ":");
                if ($pos === false) {
                    // Blank separator lines (and anything else without a colon) are not
                    // headers; previously they ended up as an entry with an empty key.
                    continue;
                }

                $key = trim(substr($line, 0, $pos));
                $value = trim(substr($line, $pos + 1));

                if ($key == "Set-Cookie") {
                    if (array_key_exists($key, $headers)) {
                        $headers[$key][] = $value;
                    } else {
                        $headers[$key] = array($value);
                    }
                } else {
                    $headers[$key] = $value;
                }
            }

            return $headers;
        }

        /**
         * Returns the HTTP status code.
         *
         * @return int
         * @author Elmer Zhang
         */
        public function httpCode()
        {
            return $this->impl_->httpCode();
        }

        /**
         * Returns the response body.
         * Typically used when fetch() returned false.
         *
         * @return string
         * @author Elmer Zhang
         */
        public function body()
        {
            return $this->impl_->body();
        }

        /**
         * Returns the cookies found in the response headers.
         *
         * @param bool $all When true, returns one array per Set-Cookie header holding the cookie
         *                  name/value plus its attributes (path, max-age, ...). When false, returns
         *                  a single name => value map of the cookies only.
         * @return array
         * @author zhiyong
         */
        public function responseCookies($all = true)
        {
            $header = (string) $this->impl_->headerContent();
            $matchs = array();
            $cookies = array();
            $kvs = array();

            if (preg_match_all('/^Set-Cookie:\s*([^\r\n]+)/im', $header, $matchs)) {
                foreach ($matchs[1] as $match) {
                    $cookie = array();
                    $isFirst = true;

                    foreach (explode(";", $match) as $part) {
                        $part = trim($part);
                        if ($part === '') {
                            // Trailing ";" or ";;" - nothing to record.
                            continue;
                        }

                        // Split on the first "=" only: cookie values (e.g. base64) may contain "=".
                        $pair = explode("=", $part, 2);
                        $name = $pair[0];
                        // Flag attributes such as HttpOnly/Secure carry no value.
                        $value = isset($pair[1]) ? $pair[1] : null;

                        $cookie[$name] = $value;

                        // The first pair of a Set-Cookie header is the cookie itself;
                        // everything after it is an attribute.
                        if ($isFirst) {
                            $kvs[$name] = $value;
                            $isFirst = false;
                        }
                    }

                    $cookies[] = $cookie;
                }
            }

            return $all ? $cookies : $kvs;
        }

        /**
         * Returns the error code.
         *
         * @return int the transport error code when there is one, otherwise the HTTP status code
         *             when it is not 200, otherwise 0
         * @author zhiyong
         */
        public function errno()
        {
            if ($this->impl_->errno() != 0) {
                return $this->impl_->errno();
            }

            if ($this->impl_->httpCode() != 200) {
                return $this->impl_->httpCode();
            }

            return 0;
        }

        /**
         * Returns the error message.
         *
         * @return string the transport error message when there is one, otherwise the HTTP reason
         *                phrase when the status is not 200, otherwise an empty string
         * @author zhiyong
         */
        public function errmsg()
        {
            if ($this->impl_->errno() != 0) {
                return $this->impl_->error();
            }

            if ($this->impl_->httpCode() != 200) {
                return $this->impl_->httpDesc();
            }

            return "";
        }

        /**
         * Re-initialises the object so that one SaeFetchurl instance can be reused.
         *
         * This re-runs the constructor without arguments, so the keys fall back to
         * SAE_ACCESSKEY / SAE_SECRETKEY and all options, headers and cookies are dropped.
         *
         * @return void
         * @author Elmer Zhang
         */
        public function clean()
        {
            $this->__construct();
        }

        /**
         * Turns debug mode on or off.
         *
         * @param bool $on true: enable debugging; false: disable debugging
         * @return void
         * @author Elmer Zhang
         */
        public function debug($on)
        {
            if ($on) {
                $this->impl_->setDebugOn();
            } else {
                $this->impl_->setDebugOff();
            }
        }

        private $impl_;
        private $opt_;
        private $headers_;
        private $method_;
        private $cookies_;
    }

    /**
     * FetchUrl, the implementation class used by SaeFetchurl.
     *
     * @package sae
     * @subpackage fetchurl
     * @author  zhiyong
     * @ignore
     */
    class FetchUrl
    {
        const end_ = "http://fetchurl.sae.sina.com.cn/" ;
        const maxRedirect_ = 5;

        /** Lower-cased names of request headers callers are not allowed to set. */
        public static $disabledHeaders = array(
            'content-length',
            'host',
            'vary',
            'via',
            'x-forwarded-for',
            'fetchurl',
            'accesskey',
            'timestamp',
            'signature',
            'allowtruncated',
            'connecttimeout',
            'sendtimeout',
            'readtimeout',
        );

        /**
         * @param string $accesskey
         * @param string $secretkey
         */
        public function __construct($accesskey, $secretkey)
        {
            $this->accesskey_ = trim($accesskey);
            $this->secretkey_ = trim($secretkey);

            $this->errno_ = 0;
            $this->error_ = null;
            $this->debug_ = false;
        }

        public function __destruct()
        {
            // do nothing
        }

        public function setAccesskey($accesskey)
        {
            $this->accesskey_ = trim($accesskey);
        }

        public function setSecretkey($secretkey)
        {
            $this->secretkey_ = trim($secretkey);
        }

        public function setDebugOn()
        {
            $this->debug_ = true;
        }

        public function setDebugOff()
        {
            $this->debug_ = false;
        }

        /**
         * Fetches $url, following 301/302 redirects unless $opt['redirect'] is falsy.
         *
         * @param string $url "http://" is prepended when no http/https scheme is present
         * @param array|null $opt request options (see dofetch())
         * @param array|null $headers extra request headers, name => value
         * @param callback|null $callback receives each chunk of the response body instead of
         *                                the chunks being buffered
         * @return string|false the buffered body when the final status is 200 (empty when a
         *                      callback consumed the body), false otherwise
         */
        public function fetch($url, $opt = null, $headers = null, $callback = null)
        {
            $url = trim($url);
            if (!$this->isAbsoluteUrl($url)) {
                $url = 'http://' . $url;
            }

            $this->callback_ = $callback;

            $maxRedirect = FetchUrl::maxRedirect_;
            if (is_array($opt) && array_key_exists('redirect', $opt) && !$opt['redirect']) {
                // A single request: redirects are reported to the caller, not followed.
                $maxRedirect = 1;
            }

            for ($i = 0; $i < $maxRedirect; ++$i) {
                $this->dofetch($url, $opt, $headers);

                if ($this->errno_ != 0) {
                    break;
                }
                if ($this->httpCode_ != 301 && $this->httpCode_ != 302) {
                    break;
                }

                // Anchored so that "Content-Location:" is not mistaken for "Location:".
                $matchs = array();
                if (!preg_match('/^Location:\s*([^\r\n]+)/im', $this->header_, $matchs)) {
                    break;
                }

                $url = $this->resolveRedirect($url, trim($matchs[1]));

                if ($this->debug_) {
                    echo "[debug] redirect to $url\n";
                }
            }

            if ($this->errno_ == 0 && $this->httpCode_ == 200) {
                return $this->body_;
            }

            return false;
        }

        public function headerContent()
        {
            return $this->header_;
        }

        public function errno()
        {
            return $this->errno_;
        }

        public function error()
        {
            return $this->error_;
        }

        public function httpCode()
        {
            return $this->httpCode_;
        }

        public function body()
        {
            return $this->body_;
        }

        public function httpDesc()
        {
            return $this->httpDesc_;
        }

        /**
         * Tells whether $url starts with an http:// or https:// scheme (case-insensitive).
         *
         * @param string $url
         * @return bool
         */
        private function isAbsoluteUrl($url)
        {
            return strncasecmp($url, 'http://', 7) == 0
                || strncasecmp($url, 'https://', 8) == 0;
        }

        /**
         * Builds the URL to request next from a redirect's Location value.
         *
         * @param string $url the URL that produced the redirect
         * @param string $location the Location header value
         * @return string
         */
        private function resolveRedirect($url, $location)
        {
            // Absolute target (http or https): use it as is.
            if ($this->isAbsoluteUrl($location)) {
                return $location;
            }

            // Protocol-relative target: keep the scheme of the current URL.
            if (substr($location, 0, 2) == '//') {
                $scheme = (strncasecmp($url, 'https://', 8) == 0) ? 'https:' : 'http:';
                return $scheme . $location;
            }

            // Otherwise the target is appended to scheme + host of the current URL.
            $origin = preg_replace('/^((?:https?:\/\/)?[^\/]+)\/(.*)$/i', '$1', $url);

            return $origin . "/" . ltrim($location, "/");
        }

        /**
         * Computes the request signature (HMAC-SHA256 over url, timestamp and access key,
         * base64-encoded).
         *
         * @param string $url
         * @param string|int $timestamp
         * @return string
         */
        private function signature($url, $timestamp)
        {
            $content = "FetchUrl" . $url . "TimeStamp" . $timestamp . "AccessKey" . $this->accesskey_;
            $signature = (base64_encode(hash_hmac('sha256', $content, $this->secretkey_, true)));

            if ($this->debug_) {
                echo "[debug] content: $content" . "\n";
                echo "[debug] signature: $signature" . "\n";
            }

            return $signature;
        }

        // writeBody & writeHeader have to be public
        // because they are handed to curl_setopt() as callbacks

        /**
         * curl write callback: passes the chunk to the user callback, or buffers it.
         *
         * @param resource $ch
         * @param string $body
         * @return int number of bytes handled (curl aborts the transfer on a mismatch)
         */
        public function writeBody($ch, $body)
        {
            if ($this->callback_) {
                call_user_func($this->callback_, $body);
            } else {
                $this->body_ .= $body;
            }

            if ($this->debug_) {
                echo "[debug] body => $body";
            }

            return strlen($body);
        }

        /**
         * curl header callback: buffers the raw response header lines.
         *
         * @param resource $ch
         * @param string $header
         * @return int number of bytes handled
         */
        public function writeHeader($ch, $header)
        {
            $this->header_ .= $header;

            if ($this->debug_) {
                echo "[debug] header => $header";
            }

            return strlen($header);
        }

        /**
         * Performs one HTTP request with curl and records status, headers, body and errors
         * on this object.
         *
         * @param string $url
         * @param array|null $opt options; recognised keys (case-insensitive): username, password,
         *                        useragent, post, cookie, multipart, truncated, connecttimeout,
         *                        sendtimeout, readtimeout. The exact key 'method' selects the verb.
         * @param array|null $headers_ extra request headers, name => value
         * @return void
         */
        private function dofetch($url, $opt, $headers_)
        {
            // Reset everything a previous request may have left behind.
            $this->header_ = '';
            $this->body_ = '';
            $this->httpCode_ = 0;
            $this->httpDesc_ = '';

            $headers = array();

            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            // NOTE(review): peer certificate verification is disabled, so https connections
            // are not authenticated. Left as is to avoid breaking hosts without a CA bundle;
            // consider enabling it.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            // 2 is the only supported "on" value; 1 (true) is rejected by current libcurl.
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
            curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'writeBody'));
            curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'writeHeader'));
            if ($this->debug_) {
                curl_setopt($ch, CURLOPT_VERBOSE, true);
            }

            if (is_array($opt) && !empty($opt)) {
                foreach ($opt as $k => $v) {
                    switch (strtolower($k)) {
                        case 'username':
                            if (array_key_exists("password", $opt)) {
                                curl_setopt($ch, CURLOPT_USERPWD, $v . ":" . $opt["password"]);
                            }
                            break;
                        case 'password':
                            if (array_key_exists("username", $opt)) {
                                curl_setopt($ch, CURLOPT_USERPWD, $opt["username"] . ":" . $v);
                            }
                            break;
                        case 'useragent':
                            curl_setopt($ch, CURLOPT_USERAGENT, $v);
                            break;
                        case 'post':
                            curl_setopt($ch, CURLOPT_POSTFIELDS, $v);
                            break;
                        case 'cookie':
                            curl_setopt($ch, CURLOPT_COOKIESESSION, true);
                            curl_setopt($ch, CURLOPT_COOKIE, $v);
                            break;
                        case 'multipart':
                            // For array post data curl emits its own multipart Content-Type
                            // including the boundary; overriding it here would drop the
                            // boundary and make the body unparseable.
                            if ($v && !(isset($opt['post']) && is_array($opt['post']))) {
                                array_push($headers, "Content-Type: multipart/form-data");
                            }
                            break;
                        case 'truncated':
                            array_push($headers, "AllowTruncated:" . $v);
                            break;
                        case 'connecttimeout':
                            array_push($headers, "ConnectTimeout:" . intval($v));
                            break;
                        case 'sendtimeout':
                            array_push($headers, "SendTimeout:" . intval($v));
                            break;
                        case 'readtimeout':
                            array_push($headers, "ReadTimeout:" . intval($v));
                            break;
                        default:
                            break;
                    }
                }
            }

            if (isset($opt['method'])) {
                $method = strtolower(trim($opt['method']));
                switch ($method) {
                    case '':
                        break;
                    case 'get':
                        curl_setopt($ch, CURLOPT_HTTPGET, true);
                        break;
                    case 'post':
                        // With post data CURLOPT_POSTFIELDS already switched to POST;
                        // without it, send an empty body so the request is still a POST.
                        if (!isset($opt['post'])) {
                            curl_setopt($ch, CURLOPT_POSTFIELDS, '');
                        }
                        break;
                    case 'head':
                        curl_setopt($ch, CURLOPT_NOBODY, true);
                        break;
                    default:
                        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, strtoupper($method));
                        break;
                }
            }

            if (is_array($headers_) && !empty($headers_)) {
                foreach ($headers_ as $k => $v) {
                    if (!in_array(strtolower($k), FetchUrl::$disabledHeaders)) {
                        array_push($headers, "{$k}:" . $v);
                    }
                }
            }

            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

            curl_setopt($ch, CURLOPT_ENCODING, "");
            curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
            curl_setopt($ch, CURLINFO_HEADER_OUT, true);

            curl_exec($ch);
            $info = curl_getinfo($ch);
            if ($this->debug_) {
                echo "[debug] curl_getinfo => " . print_r($info, true) . "\n";
            }

            $this->errno_ = curl_errno($ch);
            $this->error_ = curl_error($ch);

            if ($this->errno_ == 0) {
                // Status line: "<version> <code>[ <reason>]"; the reason phrase is optional.
                $matchs = array();
                if (preg_match('/^(?:[^\s]+)\s([^\s]+)(?:[ \t]([^\r\n]*))?/', $this->header_, $matchs)) {
                    $this->httpCode_ = (int) $matchs[1];
                    $this->httpDesc_ = isset($matchs[2]) ? $matchs[2] : '';
                    if ($this->debug_) {
                        echo "[debug] httpCode = " . $this->httpCode_ . "  httpDesc = " . $this->httpDesc_ . "\n";
                    }
                } else {
                    $this->errno_ = -1;
                    $this->error_ = "invalid response";
                }
            }

            curl_close($ch);
        }

        private $accesskey_;
        private $secretkey_;

        private $errno_;
        private $error_;
        private $httpCode_;
        private $httpDesc_;
        private $header_ = '';
        private $body_ = '';
        private $callback_;

        private $debug_;
    }