|
<?php
/**
* SAE数据抓取服务
*
* @author zhiyong
* @version $Id: SaeFetchurl.class.php 2766 2012-02-20 15:58:21Z luofei614@gmail.com $
* @package sae
*
*/
/**
* SAE数据抓取class
*
* SaeFetchurl用于抓取外部数据。支持的协议为http/https。<br />
* 该类已被废弃,请直接使用curl抓取外部资源
* @deprecated 该类已被废弃,请直接使用curl抓取外部资源
*
* 默认超时时间:
* - 连接超时: 5秒
* - 发送数据超时: 30秒
* - 接收数据超时: 40秒
*
* 抓取页面
* <code>
* $f = new SaeFetchurl();
* $content = $f->fetch('http://sina.cn');
* </code>
*
* 发起POST请求
* <code>
* $f = new SaeFetchurl();
* $f->setMethod('post');
* $f->setPostData( array('name'=> 'easychen' , 'email' => 'easychen@gmail.com' , 'file' => '文件的二进制内容') );
* $ret = $f->fetch('http://photo.sinaapp.com/save.php');
*
* //抓取失败时输出错误码和错误信息
* if ($ret === false)
* var_dump($f->errno(), $f->errmsg());
* </code>
*
* 错误码参考:
* - errno: 0 成功
* - errno: 600 fetchurl 服务内部错误
* - errno: 601 accesskey 不存在
* - errno: 602 认证错误,可能是secretkey错误
* - errno: 603 超出fetchurl的使用配额
* - errno: 604 REST 协议错误,相关的header不存在或其它错误,建议使用SAE提供的fetch_url函数
* - errno: 605 请求的URI格式不合法
* - errno: 606 请求的URI,服务器不可达。
*
* @author zhiyong
* @version $Id: SaeFetchurl.class.php 2766 2012-02-20 15:58:21Z luofei614@gmail.com $
* @package sae
*
*/
class SaeFetchurl extends SaeObject
{
function __construct( $akey = NULL , $skey = NULL )
{
if( $akey === NULL )
$akey = SAE_ACCESSKEY;
if( $skey === NULL )
$skey = SAE_SECRETKEY;
$this->impl_ = new FetchUrl($akey, $skey);
$this->method_ = "get";
$this->cookies_ = array();
$this->opt_ = array();
$this->headers_ = array();
}
/**
* 设置acccesskey和secretkey
*
* 使用当前的应用的key时,不需要调用此方法
*
* @param string $akey
* @param string $skey
* @return void
* @author zhiyong
* @ignore
*/
public function setAuth( $akey , $skey )
{
$this->impl_->setAccesskey($akey);
$this->impl_->setSecretkey($skey);
}
/**
* @ignore
*/
public function setAccesskey( $akey )
{
$this->impl_->setAccesskey($akey);
}
/**
* @ignore
*/
public function setSecretkey( $skey )
{
$this->impl_->setSecretkey($skey);
}
/**
* 设置请求的方法(POST/GET/PUT... )
*
* @param string $method
* @return void
* @author zhiyong
*/
public function setMethod( $method )
{
$this->method_ = trim($method);
$this->opt_['method'] = trim($method);
}
/**
* 设置POST方法的数据
*
* @param array|string $post_data 当格式为array时,key为变量名称,value为变量值,使用multipart方式提交。当格式为string时,直接做为post的content提交。与curl_setopt($ch, CURLOPT_POSTFIELDS, $data)中$data的格式相同。
* @param bool $multipart value是否为二进制数据
* @return bool
* @author zhiyong
*/
public function setPostData( $post_data , $multipart = false )
{
$this->opt_["post"] = $post_data;
$this->opt_["multipart"] = $multipart;
return true;
}
/**
* 在发起的请求中,添加请求头
*
* 不可以使用此方法设定的头:
* - Content-Length
* - Host
* - Vary
* - Via
* - X-Forwarded-For
* - FetchUrl
* - AccessKey
* - TimeStamp
* - Signature
* - AllowTruncated //可使用setAllowTrunc方法来进行设定
* - ConnectTimeout //可使用setConnectTimeout方法来进行设定
* - SendTimeout //可使用setSendTimeout方法来进行设定
* - ReadTimeout //可使用setReadTimeout方法来进行设定
*
*
* @param string $name
* @param string $value
* @return bool
* @author zhiyong
*/
public function setHeader( $name , $value )
{
$name = trim($name);
if (!in_array(strtolower($name), FetchUrl::$disabledHeaders)) {
$this->headers_[$name] = $value;
return true;
} else {
trigger_error("Disabled FetchUrl Header:" . $name, E_USER_NOTICE);
return false;
}
}
/**
* 设置FetchUrl参数
*
* 参数列表:
* - truncated 布尔 是否截断
* - redirect 布尔 是否支持重定向
* - username 字符串 http认证用户名
* - password 字符串 http认证密码
* - useragent 字符串 自定义UA
*
* @param string $name
* @param string $value
* @return void
* @author Elmer Zhang
* @ignore
*/
public function setOpt( $name , $value )
{
$name = trim($name);
$this->opt_[$name] = $value;
}
/**
* 在发起的请求中,批量添加cookie数据
*
* @param array $cookies 要添加的Cookies,格式:array('key1' => 'value1', 'key2' => 'value2', ....)
* @return void
* @author zhiyong
*/
public function setCookies( $cookies = array() )
{
if ( is_array($cookies) and !empty($cookies) ) {
foreach ( $cookies as $k => $v ) {
$this->setCookie($k, $v);
}
}
}
/**
* 在发起的请求中,添加cookie数据,此函数可多次调用,添加多个cookie
*
* @param string $name
* @param string $value
* @return void
* @author zhiyong
*/
public function setCookie( $name , $value )
{
$name = trim($name);
array_push($this->cookies_, "$name=$value");
}
/**
* 是否允许截断,默认为不允许
*
* 如果设置为true,当发送数据超过允许大小时,自动截取符合大小的部分;<br />
* 如果设置为false,当发送数据超过允许大小时,直接返回false;
*
* @param bool $allow
* @return void
* @author zhiyong
*/
public function setAllowTrunc($allow) {
$this->opt_["truncated"] = $allow;
}
/**
* 设置连接超时时间,此时间必须小于SAE系统设置的时间,否则以SAE系统设置为准(默认为5秒)
*
* @param int $ms 毫秒
* @return void
* @author zhiyong
*/
public function setConnectTimeout($ms) {
$this->opt_["connecttimeout"] = $ms;
}
/**
* 设置发送超时时间,此时间必须小于SAE系统设置的时间,否则以SAE系统设置为准(默认为20秒)
*
* @param int $ms 毫秒
* @return void
* @author zhiyong
*/
public function setSendTimeout($ms) {
$this->opt_["sendtimeout"] = $ms;
}
/**
* 设置读取超时时间,此时间必须小于SAE系统设置的时间,否则以SAE系统设置为准(默认为60秒)
*
* @param int $ms 毫秒
* @return void
* @author zhiyong
*/
public function setReadTimeout($ms) {
$this->opt_["ReadTimeout"] = $ms;
}
/**
* 当请求页面是转向页时,是否允许跳转,SAE最大支持5次跳转(默认不跳转)
*
* @param bool $allow 是否允许跳转。true:允许,false:禁止,默认为true
* @return void
* @author zhiyong
*/
public function setAllowRedirect($allow = true) {
$this->opt_["redirect"] = $allow;
}
/**
* 设置HTTP认证用户名密码
*
* @param string $username HTTP认证用户名
* @param string $password HTTP认证密码
* @return void
* @author zhiyong
*/
public function setHttpAuth($username, $password) {
$this->opt_["username"] = $username;
$this->opt_["password"] = $password;
}
/**
* 发起请求
*
* <code>
* <?php
* echo "Use callback function\n";
*
* function demo($content) {
* echo strtoupper($content);
* }
*
* $furl = new SaeFetchurl();
* $furl->fetch($url, $opt, 'demo');
*
* echo "Use callback class\n";
*
* class Ctx {
* public function demo($content) {
* $this->c .= $content;
* }
* public $c;
* };
*
* $ctx = new Ctx;
* $furl = new SaeFetchurl();
* $furl->fetch($url, $opt, array($ctx, 'demo'));
* echo $ctx->c;
* ?>
* </code>
*
* @param string $url
* @param array $opt 请求参数,格式:array('key1'=>'value1', 'key2'=>'value2', ... )。参数列表:
* - truncated 布尔 是否截断
* - redirect 布尔 是否支持重定向
* - username 字符串 http认证用户名
* - password 字符串 http认证密码
* - useragent 字符串 自定义UA
* @param callback $callback 用来处理返回的数据的函数。可以为函数名或某个实例对象的方法。
* @return string 成功时读取到的内容,否则返回false
* @author zhiyong
*/
public function fetch( $url, $opt = NULL, $callback=NULL )
{
if (count($this->cookies_) != 0) {
$this->opt_["cookie"] = join("; ", $this->cookies_);
}
$opt = ($opt) ? array_merge($this->opt_, $opt) : $this->opt_;
return $this->impl_->fetch($url, $opt, $this->headers_, $callback);
}
/**
* 返回数据的header信息
*
* @param bool $parse 是否解析header,默认为true。
* @return array
* @author zhiyong
*/
public function responseHeaders($parse = true)
{
$items = explode("\r\n", $this->impl_->headerContent());
if (!$parse) {
return $items;
}
array_shift($items);
$headers = array();
foreach ($items as $_) {
$pos = strpos($_, ":");
$key = trim(substr($_, 0, $pos));
$value = trim(substr($_, $pos + 1));
if ($key == "Set-Cookie") {
if (array_key_exists($key, $headers)) {
array_push($headers[$key], trim($value));
} else {
$headers[$key] = array(trim($value));
}
} else {
$headers[$key] = trim($value);
}
}
return $headers;
}
/**
* 返回HTTP状态码
*
* @return int
* @author Elmer Zhang
*/
public function httpCode() {
return $this->impl_->httpCode();
}
/**
* 返回网页内容
* 常用于fetch()方法返回false时
*
* @return string
* @author Elmer Zhang
*/
public function body() {
return $this->impl_->body();
}
/**
* 返回头里边的cookie信息
*
* @param bool $all 是否返回完整Cookies信息。为true时,返回Cookie的name,value,path,max-age,为false时,只返回Cookies的name, value
* @return array
* @author zhiyong
*/
public function responseCookies($all = true)
{
$header = $this->impl_->headerContent();
$matchs = array();
$cookies = array();
$kvs = array();
if (preg_match_all('/Set-Cookie:\s([^\r\n]+)/i', $header, $matchs)) {
foreach ($matchs[1] as $match) {
$cookie = array();
$items = explode(";", $match);
foreach ($items as $_) {
$item = explode("=", trim($_));
$cookie[$item[0]]= $item[1];
}
array_push($cookies, $cookie);
$kvs = array_merge($kvs, $cookie);
}
}
if ($all) {
return $cookies;
} else {
unset($kvs['path']);
unset($kvs['max-age']);
return $kvs;
}
}
/**
* 返回错误码
*
* @return int
* @author zhiyong
*/
public function errno()
{
if ($this->impl_->errno() != 0) {
return $this->impl_->errno();
} else {
if ($this->impl_->httpCode() != 200) {
return $this->impl_->httpCode();
}
}
return 0;
}
/**
* 返回错误信息
*
* @return string
* @author zhiyong
*/
public function errmsg()
{
if ($this->impl_->errno() != 0) {
return $this->impl_->error();
} else {
if ($this->impl_->httpCode() != 200) {
return $this->impl_->httpDesc();
}
}
return "";
}
/**
* 将对象的数据重新初始化,用于多次重用一个SaeFetchurl对象
*
* @return void
* @author Elmer Zhang
*/
public function clean() {
$this->__construct();
}
/**
* 开启/关闭调试模式
*
* @param bool $on true:开启调试;false:关闭调试
* @return void
* @author Elmer Zhang
*/
public function debug($on) {
if ($on) {
$this->impl_->setDebugOn();
} else {
$this->impl_->setDebugOff();
}
}
private $impl_;
private $opt_;
private $headers_;
}
/**
* FetchUrl , the sub class of SaeFetchurl
*
*
* @package sae
* @subpackage fetchurl
* @author zhiyong
* @ignore
*/
class FetchUrl {
const end_ = "http://fetchurl.sae.sina.com.cn/" ;
const maxRedirect_ = 5;
public static $disabledHeaders = array(
'content-length',
'host',
'vary',
'via',
'x-forwarded-for',
'fetchurl',
'accesskey',
'timestamp',
'signature',
'allowtruncated',
'connecttimeout',
'sendtimeout',
'readtimeout',
);
public function __construct($accesskey, $secretkey) {
$accesskey = trim($accesskey);
$secretkey = trim($secretkey);
$this->accesskey_ = $accesskey;
$this->secretkey_ = $secretkey;
$this->errno_ = 0;
$this->error_ = null;
$this->debug_ = false;
}
public function __destruct() {
// do nothing
}
public function setAccesskey($accesskey) {
$accesskey = trim($accesskey);
$this->accesskey_ = $accesskey;
}
public function setSecretkey($secretkey) {
$secretkey = trim($secretkey);
$this->secretkey_ = $secretkey;
}
public function setDebugOn() {
$this->debug_ = true;
}
public function setDebugOff() {
$this->debug_ = false;
}
public function fetch($url, $opt = null, $headers = null, $callback = null) {
$url = trim($url);
if (substr($url, 0, 7) != 'http://' && substr($url, 0, 8) != 'https://') {
$url = 'http://' . $url;
}
$this->callback_ = $callback;
$maxRedirect = FetchUrl::maxRedirect_;
if (is_array($opt) && array_key_exists('redirect',$opt) && !$opt['redirect']) {
$maxRedirect = 1;
}
for ($i = 0; $i < $maxRedirect; ++$i) {
$this->dofetch($url, $opt, $headers);
if ($this->errno_ == 0) {
if ($this->httpCode_ == 301 || $this->httpCode_ == 302) {
$matchs = array();
if (preg_match('/Location:\s([^\r\n]+)/i', $this->header_, $matchs)) {
$newUrl = $matchs[1];
// if new domain
if (strncasecmp($newUrl, "http://", strlen("http://")) == 0) {
$url = $newUrl;
} else {
$url = preg_replace('/^((?:https?:\/\/)?[^\/]+)\/(.*)$/i', '$1', $url) . "/". $newUrl;
}
if ($this->debug_) {
echo "[debug] redirect to $url\n";
}
continue;
}
}
}
break;
}
if ($this->errno_ == 0 && $this->httpCode_ == 200) {
return $this->body_;
} else {
return false;
}
}
public function headerContent() {
return $this->header_;
}
public function errno() {
return $this->errno_;
}
public function error() {
return $this->error_;
}
public function httpCode() {
return $this->httpCode_;
}
public function body() {
return $this->body_;
}
public function httpDesc() {
return $this->httpDesc_;
}
private function signature($url, $timestamp) {
$content = "FetchUrl" . $url .
"TimeStamp" . $timestamp .
"AccessKey" . $this->accesskey_;
$signature = (base64_encode(hash_hmac('sha256',$content,$this->secretkey_,true)));
if ($this->debug_) {
echo "[debug] content: $content" . "\n";
echo "[debug] signature: $signature" . "\n";
}
return $signature;
}
// we have to set wirteBody & writeHeader public
// for we used them in curl_setopt()
public function writeBody($ch, $body) {
if ($this->callback_) {
call_user_func($this->callback_, $body);
} else {
$this->body_ .= $body;
}
if ($this->debug_) {
echo "[debug] body => $body";
}
return strlen($body);
}
public function writeHeader($ch, $header) {
$this->header_ .= $header;
if ($this->debug_) {
echo "[debug] header => $header";
}
return strlen($header);
}
private function dofetch($url, $opt, $headers_) {
$this->header_ = $this->body_ = null;
$headers = array();
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,false) ;
curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,true) ;
curl_setopt($ch, CURLOPT_WRITEFUNCTION, array($this, 'writeBody'));
curl_setopt($ch, CURLOPT_HEADERFUNCTION, array($this, 'writeHeader'));
if ($this->debug_) {
curl_setopt($ch, CURLOPT_VERBOSE, true);
}
if (is_array($opt) && !empty($opt)) {
foreach( $opt as $k => $v) {
switch(strtolower($k)) {
case 'username':
if (array_key_exists("password",$opt)) {
curl_setopt($ch, CURLOPT_USERPWD, $v . ":" . $opt["password"]);
}
break;
case 'password':
if (array_key_exists("username",$opt)) {
curl_setopt($ch, CURLOPT_USERPWD, $opt["username"] . ":" . $v);
}
break;
case 'useragent':
curl_setopt($ch, CURLOPT_USERAGENT, $v);
break;
case 'post':
curl_setopt($ch, CURLOPT_POSTFIELDS, $v);
break;
case 'cookie':
curl_setopt($ch, CURLOPT_COOKIESESSION, true);
curl_setopt($ch, CURLOPT_COOKIE, $v);
break;
case 'multipart':
if ($v) array_push($headers, "Content-Type: multipart/form-data");
break;
case 'truncated':
array_push($headers, "AllowTruncated:" . $v);
break;
case 'connecttimeout':
array_push($headers, "ConnectTimeout:" . intval($v));
break;
case 'sendtimeout':
array_push($headers, "SendTimeout:" . intval($v));
break;
case 'readtimeout':
array_push($headers, "ReadTimeout:" . intval($v));
break;
default:
break;
}
}
}
if (isset($opt['method'])) {
if (strtolower($opt['method']) == 'get') {
curl_setopt($ch, CURLOPT_HTTPGET, true);
}
}
if (is_array($headers_) && !empty($headers_)) {
foreach($headers_ as $k => $v) {
if (!in_array(strtolower($k), FetchUrl::$disabledHeaders)) {
array_push($headers, "{$k}:" . $v);
}
}
}
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
curl_setopt($ch, CURLOPT_ENCODING, "");
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
curl_setopt($ch, CURLINFO_HEADER_OUT, true);
curl_exec($ch);
$info = curl_getinfo($ch);
if ($this->debug_) {
echo "[debug] curl_getinfo => " . print_r($info, true) . "\n";
}
$this->errno_ = curl_errno($ch);
$this->error_ = curl_error($ch);
if ($this->errno_ == 0) {
$matchs = array();
if (preg_match('/^(?:[^\s]+)\s([^\s]+)\s([^\r\n]+)/', $this->header_, $matchs)) {
$this->httpCode_ = $matchs[1];
$this->httpDesc_ = $matchs[2];
if ($this->debug_) {
echo "[debug] httpCode = " . $this->httpCode_ . " httpDesc = " . $this->httpDesc_ . "\n";
}
} else {
$this->errno_ = -1;
$this->error_ = "invalid response";
}
}
curl_close($ch);
}
private $accesskey_;
private $secretkey_;
private $errno_;
private $error_;
private $httpCode_;
private $httpDesc_;
private $header_;
private $body_;
private $debug_;
}
|