用 PHP 抓取网页,最简单的方法是 file_get_contents,其次是 curl。
curl 我写个简单的例子,支持https:
/**
 * Fetch a URL with cURL and return the response body as a string.
 *
 * Works for both http:// and https:// URLs. For https the server certificate
 * and host name are verified; if a "cacert.pem" file exists in the current
 * working directory it is used as the CA bundle, otherwise cURL's default
 * CA store is used.
 *
 * @param string       $url      URL to request.
 * @param array|string $params   POST body; only sent when $is_post is true.
 *                               An array is sent by cURL as multipart/form-data,
 *                               a string is sent as-is.
 * @param bool         $is_post  true to send a POST request, false for GET.
 * @param int          $time_out Limit in seconds, applied both to connecting and
 *                               to the whole transfer.
 * @param array        $header   Extra request headers, one "Name: value" string each.
 *
 * @return string|false Response body (without response headers), or false on
 *                      failure. On failure an E_USER_WARNING carrying the cURL
 *                      error message is raised.
 */
function curl_file_get_contents($url, $params = array(), $is_post = false, $time_out = 20, $header=array())
{
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('curl_file_get_contents: curl_init() failed', E_USER_WARNING);
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    // Force IPv4 name resolution.
    curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
    // Do not include the response headers in the returned body.
    curl_setopt($ch, CURLOPT_HEADER, false);
    // Return the body from curl_exec() as a string instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_TIMEOUT, $time_out);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $time_out);

    // Optional knobs, left disabled:
    //curl_setopt($ch, CURLINFO_HEADER_OUT, true);   // needed to read the sent request headers via curl_getinfo()
    //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow HTTP redirects
    //curl_setopt($ch, CURLOPT_PROXY, $proxy_host);   // use an HTTP proxy
    //curl_setopt($ch, CURLOPT_PROXYPORT, $proxy_port);
    //curl_setopt($ch, CURLOPT_PROXYUSERPWD, "$proxy_user:$proxy_pwd");

    if (stripos($url, 'https://') === 0) {
        // Verify the certificate chain and that the certificate matches the host.
        // Disabling peer verification would make the CA bundle below pointless
        // and leave the connection open to man-in-the-middle attacks.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        $ca = getcwd()."/cacert.pem";
        if (is_file($ca)) {
            curl_setopt($ch, CURLOPT_CAINFO, $ca);
        }
    }

    if ($is_post) {
        // CURLOPT_POST already selects the POST method; no CURLOPT_CUSTOMREQUEST needed.
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $params);
    } else {
        curl_setopt($ch, CURLOPT_HTTPGET, true);
    }

    if ($header) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    }

    // To forward the caller's PHP session cookie, uncomment:
    //if(isset($_COOKIE[session_name()])){
    //    curl_setopt($ch, CURLOPT_COOKIE, session_name()."=".$_COOKIE[session_name()]);
    //}
    //session_write_close(); // release the session lock before requesting a page that uses the same session

    $response = curl_exec($ch);
    if ($response === false) {
        trigger_error(
            sprintf('curl_file_get_contents: %s (errno %d)', curl_error($ch), curl_errno($ch)),
            E_USER_WARNING
        );
    }

    // To inspect the sent request headers (requires CURLINFO_HEADER_OUT above):
    //print_r(curl_getinfo($ch, CURLINFO_HEADER_OUT));

    curl_close($ch);
    return $response;
}
// Usage: send a POST request with custom headers and print the response body.
$fullurl = "https://www.baidu.com";
$header = [
    "User-Agent: custom bot",
    "Authorization: ok",
    "fromclient: php",
    // An empty Expect header stops cURL from sending "Expect: 100-continue"
    // for POST bodies larger than 1024 bytes, which some servers answer
    // with an error or a 504.
    "Expect:",
    // To forward the current session cookie as well:
    //"Cookie: ".session_name()."=".$_COOKIE[session_name()],
];
$post_data = ['id' => 1];
$response = curl_file_get_contents($fullurl, $post_data, true, 20, $header);
// For a plain GET request only the URL is needed:
//$response = curl_file_get_contents($fullurl);
echo $response;
如果需要提取页面中的链接、图片、表格等数据,可以使用 snoopy:
http://sourceforge.net/projects/snoopy
如果想像 jQuery 那样用选择器操作 HTML,可以采用 simple_html_dom。
源码:
https://simplehtmldom.sourceforge.io/docs/1.9/
https://sourceforge.net/projects/simplehtmldom/
使用例子:
https://www.cnblogs.com/rockchip/p/3202552.html (推荐)
https://baijiahao.baidu.com/s?id=1714463047885529357
我写了一个例子:
// Example: fetch a page, rewrite every <a href> so it points at another host,
// then save the "div#news" element to test.html.
// Uses full_url_format() and replaceHost(), both defined further down.
include "simple_html_dom.php";
$fullurl="https://www.baidu.com";
$localurl="https://www.163.com";
$response = curl_file_get_contents($fullurl);
// Only parse when the fetch succeeded. str_get_html() is expected to return
// a falsy value when it cannot parse the input (per the simple_html_dom docs).
$html = ($response === false) ? false : str_get_html($response);
if ($html) {
    foreach ($html->find('a') as $element) {
        $href = $element->href;
        // Leave anchors without a usable href untouched.
        if (!is_string($href) || $href === '') {
            continue;
        }
        // Complete the link, e.g. ../bb/1.png becomes https://www.aabb.com/aa/bb/1.png
        $newurl = full_url_format($fullurl, $href);
        // Only http(s) links have a host to replace; skip javascript:, mailto:, #anchors, ...
        if (!preg_match('#^https?://#i', $newurl)) {
            continue;
        }
        // Replace the host with www.163.com and prefix the path with /subDir
        $newurl = replaceHost($newurl, $localurl, '/subDir');
        $element->href = $newurl;
    }
    $news = $html->find("div#news", 0);
    // find(..., 0) yields no element when nothing matches; guard before using it.
    if ($news) {
        $newsHtml = $news->innertext;
        $news->save('test.html');
    }
}
上面代码用到的full_url_format和replaceHost代码如下:
/**
 * Replace the host (and port) of a URL, optionally prefixing its path.
 *
 * The scheme, path, query string and fragment of $srcurl are kept; user info
 * and the original port are discarded.
 *
 * @param string $srcurl   URL whose host should be replaced.
 * @param string $hostname New host, given either as "host" / "host:port" or as a
 *                         full URL ("https://host:port") from which host and port
 *                         are taken.
 * @param string $rootPath Path prefix inserted between the new host and the
 *                         original path, e.g. "/subDir".
 *
 * @return string The rewritten URL. If $srcurl has no scheme, the scheme of
 *                $hostname is used when it has one, otherwise the result is
 *                scheme-relative ("//host/..."). If $srcurl cannot be parsed it
 *                is returned unchanged.
 */
function replaceHost($srcurl,$hostname,$rootPath=''){
    // $hostname is either "host" / "host:port" or a full URL.
    $hostinfo = parse_url($hostname);
    $host = $hostname;
    $port = '';
    $hostScheme = '';
    if (is_array($hostinfo) && isset($hostinfo['scheme'], $hostinfo['host'])) {
        $host = $hostinfo['host'];
        $port = isset($hostinfo['port']) ? ':'.$hostinfo['port'] : '';
        $hostScheme = $hostinfo['scheme'];
    }

    $srcinfo = parse_url($srcurl);
    if (!is_array($srcinfo)) {
        return $srcurl;
    }

    if (isset($srcinfo['scheme'])) {
        $prefix = $srcinfo['scheme'].'://';
    } elseif ($hostScheme !== '') {
        $prefix = $hostScheme.'://';
    } else {
        $prefix = '//';
    }

    $path = isset($srcinfo['path']) ? $srcinfo['path'] : '';
    // Compare against '' rather than using empty(): a query or fragment of "0" is valid.
    $query = (isset($srcinfo['query']) && $srcinfo['query'] !== '') ? '?'.$srcinfo['query'] : '';
    $fragment = (isset($srcinfo['fragment']) && $srcinfo['fragment'] !== '') ? '#'.$srcinfo['fragment'] : '';

    return $prefix.$host.$port.$rootPath.$path.$query.$fragment;
}
/**
 * Turn a (possibly relative) link into an absolute URL, resolved against a base URL.
 *
 * - A link that already has a scheme (http:, https:, mailto:, javascript:, ...)
 *   is returned unchanged.
 * - A scheme-relative link ("//host/path") gets the base URL's scheme.
 * - A link without a path component (e.g. "#top" or "?page=2") is returned unchanged.
 * - Otherwise the path is resolved against the base: "." and ".." segments are
 *   collapsed, and ".." never climbs above the root.
 *
 * @param string $baseurl Absolute URL of the page the link was found on.
 * @param string $srcurl  Link to complete, e.g. "../bb/1.png".
 *
 * @return string Absolute URL, or $srcurl unchanged when it needs no resolving or
 *                when either URL cannot be parsed / the base lacks scheme or host.
 */
function full_url_format($baseurl, $srcurl) {
    $srcinfo = parse_url($srcurl);
    if ($srcinfo === false || isset($srcinfo['scheme'])) {
        return $srcurl;
    }

    $baseinfo = parse_url($baseurl);
    if ($baseinfo === false || !isset($baseinfo['scheme'], $baseinfo['host'])) {
        return $srcurl;
    }

    // Scheme-relative link: it names its own host, so only the scheme is missing.
    if (isset($srcinfo['host'])) {
        return $baseinfo['scheme'].':'.$srcurl;
    }

    if (!isset($srcinfo['path'])) {
        return $srcurl;
    }

    $port = isset($baseinfo['port']) ? ':'.$baseinfo['port'] : '';
    $url = $baseinfo['scheme'].'://'.$baseinfo['host'].$port;

    if (substr($srcinfo['path'], 0, 1) === '/') {
        $path = $srcinfo['path'];
    } else {
        // Resolve against the base "directory": everything up to and including the
        // last slash. dirname() is not used because it strips the final directory
        // of a base path ending in "/" and returns "\" on Windows.
        $basepath = isset($baseinfo['path']) ? $baseinfo['path'] : '/';
        $slash = strrpos($basepath, '/');
        $basedir = ($slash === false) ? '/' : substr($basepath, 0, $slash + 1);
        $path = $basedir.$srcinfo['path'];
    }

    // Compare against '' rather than using empty(): a query or fragment of "0" is valid.
    $query = (isset($srcinfo['query']) && $srcinfo['query'] !== '') ? '?'.$srcinfo['query'] : '';
    $fragment = (isset($srcinfo['fragment']) && $srcinfo['fragment'] !== '') ? '#'.$srcinfo['fragment'] : '';

    // Collapse empty, "." and ".." segments.
    $segments = explode('/', $path);
    $last = end($segments);
    $stack = array();
    foreach ($segments as $segment) {
        if ($segment === '' || $segment === '.') {
            continue;
        }
        if ($segment === '..') {
            // Popping an empty stack is a no-op, so ".." cannot go above the root.
            array_pop($stack);
            continue;
        }
        $stack[] = $segment;
    }

    $normalized = '/'.implode('/', $stack);
    // Keep the trailing slash when the path pointed at a directory.
    if ($stack && ($last === '' || $last === '.' || $last === '..')) {
        $normalized .= '/';
    }

    // Backslashes in the link are turned into forward slashes.
    $url .= str_replace('\\', '/', $normalized).$query.$fragment;
    return $url;
}
以上足够使用。
对于 https 链接,snoopy 和 simple_html_dom 默认是不支持的。可以先用上面的 curl_file_get_contents 抓取页面内容,再交给 str_get_html 解析。