PHP采集有声小说

2016-1-1 海滨 程序就是一个世界

关键代码实现

去年暑假写的,主要是给站长做采集用的,另外给大家两个网页采集相关的类,一个是网页爬虫crack,一个是网页DOM解析器phpQuery。

caiji.php

<?php
	// Collector entry page: scraping many pages can take long, so lift the execution time limit.
	set_time_limit(0);
	// The page is emitted as GB2312 and must never be cached by the browser.
	header("Content-Type:text/html;charset=gb2312");
	header("Cache-Control: no-cache, must-revalidate");
	header("Pragma: no-cache");
	//include_once 'QueryList/phpQuery/phpQuery.php';
	// DOM helper that provides the phpQuery class and pq() used further down.
	include 'phpQuery/phpQuery.php';
	/**
	 * Fetch a URL with cURL (GET, or POST when $post_file is given) and return the response.
	 *
	 * @param string       $url            Address to request.
	 * @param string|array $post_file      POST payload; when empty a plain GET is issued.
	 * @param string       $cookie_file    File in which cURL stores received cookies (optional).
	 * @param mixed        $isgzip         Non-empty: let cURL decode gzip/deflate responses.
	 * @param bool         $is_show_header Whether the response headers are included in the result.
	 * @param string       $refer          Referer to send; defaults to $url itself.
	 * @param string       $ip             Address advertised via X-FORWARDED-FOR / CLIENT-IP.
	 * @return string|false Response text, or false on failure (the cURL error is echoed).
	 */
	function getPostCon($url,$post_file='',$cookie_file='',$isgzip='',$is_show_header=false,$refer='',$ip=''){
		$ch = curl_init($url);

		curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)');
		curl_setopt($ch, CURLOPT_HEADER, $is_show_header); // include response headers in the output when true
		if(!empty($cookie_file)){
			curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file); // persist received cookies
		}

		// Build the complete header list BEFORE handing it to cURL. Previously the
		// IP headers were appended after CURLOPT_HTTPHEADER had been applied, so
		// they were never actually sent.
		$header = array(
			'Accept:*/*',
			'Accept-Encoding:gzip, deflate',
			'Accept-Language:zh-CN,zh;q=0.8',
			'Cache-Control:no-cache',
			'Pragma:no-cache',
		);
		//$header[]='X-Requested-With:XMLHttpRequest';
		if($ip){
			$header[] = 'X-FORWARDED-FOR:'.$ip;
			$header[] = 'CLIENT-IP:'.$ip;
		}
		curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
		if(!empty($isgzip)){
			curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); // transparently decode compressed bodies
		}

		// Fall back to the requested URL itself as the Referer.
		curl_setopt($ch, CURLOPT_REFERER, $refer ? $refer : $url);

		if(!empty($post_file)){
			// Regular application/x-www-form-urlencoded POST, as sent by HTML forms.
			curl_setopt($ch, CURLOPT_POST, 1);
			curl_setopt($ch, CURLOPT_POSTFIELDS, $post_file);
		}
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response instead of printing it
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);          // give up after 30 seconds

		$output = curl_exec($ch);
		if ($output === FALSE) {
			echo "cURL Error:".curl_error($ch);
		}
		curl_close($ch);
		return $output;
	}
	/**
	 * Extract the first cookie pair from a raw response that contains headers.
	 *
	 * Only the text between "Set-Cookie:" and the first ";" on that line is
	 * returned (leading whitespace is kept as-is).
	 *
	 * @param string $con Raw response text including the header section.
	 * @return string|null The captured cookie text, or null when none is found or it is empty.
	 */
	function getCookie($con){
		$matches = array();
		if (!preg_match('/Set-Cookie:(.*?);/i', $con, $matches)) {
			return null;
		}
		if (empty($matches[1])) {
			return null;
		}
		return $matches[1];
	}
	/**
	 * Reassemble a URL string from the component array produced by parse_url().
	 *
	 * Missing components are treated as empty strings, so inputs without a
	 * path (e.g. "http://host") no longer raise undefined-index warnings.
	 *
	 * @param array|false $url_arr Components: scheme, host, port, path, query, fragment.
	 * @return string The rebuilt URL.
	 */
	function http_build_url($url_arr){
		$scheme = isset($url_arr['scheme']) ? $url_arr['scheme'] : '';
		$host   = isset($url_arr['host'])   ? $url_arr['host']   : '';
		$path   = isset($url_arr['path'])   ? $url_arr['path']   : '';

		$new_url = $scheme . "://" . $host;
		if(!empty($url_arr['port']))
			$new_url = $new_url . ":" . $url_arr['port'];
		$new_url = $new_url . $path;
		if(!empty($url_arr['query']))
			$new_url = $new_url . "?" . $url_arr['query'];
		if(!empty($url_arr['fragment']))
			$new_url = $new_url . "#" . $url_arr['fragment'];
		return $new_url;
	}
	/**
	 * Return the "directory" form of a URL: the last path segment is removed
	 * and the path is terminated with "/". URLs without a path are rebuilt as-is.
	 *
	 * @param string $url Absolute URL, e.g. http://host/a/b/page.htm
	 * @return string URL rebuilt by http_build_url(), e.g. http://host/a/b/
	 */
	function getReUrl($url){
		$parts = parse_url($url);
		if(!empty($parts['path'])){
			$segments = explode('/', $parts['path']);
			// Drop the trailing segment (the file name) and close with a slash.
			$parts['path'] = implode('/', array_slice($segments, 0, -1)) . '/';
		}
		return http_build_url($parts);
	}
	/**
	 * Determine the client address from the request environment.
	 *
	 * The first non-empty value of HTTP_CLIENT_IP, HTTP_X_FORWARDED_FOR and
	 * REMOTE_ADDR (checked in that order) wins.
	 *
	 * @return string The address as found in $_SERVER, or "" when none is set.
	 */
	function GetIP(){
		$candidates = array("HTTP_CLIENT_IP", "HTTP_X_FORWARDED_FOR", "REMOTE_ADDR");
		foreach($candidates as $key){
			if(!empty($_SERVER[$key])){
				return $_SERVER[$key];
			}
		}
		return "";
	}
	// ---- Collector page: build the episode list for every submitted index page ----

	// Visitor address, passed on to getPostCon() for its IP request headers.
	$ip=GetIP();

	// One index-page URL per line. Accept any newline convention, trim each
	// line and drop blank ones: previously a missing field or a trailing empty
	// line produced an empty URL, which aborted the whole run with the
	// "rule no longer matches" exit below.
	$raw_urls=(isset($_POST['url']) && is_string($_POST['url'])) ? $_POST['url'] : '';
	$urls=array_filter(array_map('trim',preg_split('/\r\n|\r|\n/',$raw_urls)),'strlen');

	$count=0; // running episode number across all submitted pages
	$con='
		<script type="text/javascript" src="jquery.min.js"></script>
		从第<input type="text" id="eq" >集开始:
		<button id="start">启动采集</button>
		<ul id="downlist">';
	$input_text=""; // accumulated "title$url$flv" lines shown in the textarea
	//$url='http://www.tingchina.com/yousheng/22132/play_22132_563.htm';
	foreach($urls as $url){
		$Pagecon=getPostCon($url,'','',1,false,'',$ip);

		// Episode links look like: class="b2"><a href="HREF" ...>TITLE</a>
		$regex='/class=\"b2\"><a href=\"(.*?)\".*?>(.*?)<\/a>/i';
		preg_match_all($regex,$Pagecon,$m);
		if(empty($m[1]) || empty($m[2])){
			exit('规则失效了');
		}
		$hrefs=$m[1];  // capture group 1: link targets
		$titles=$m[2]; // capture group 2: link texts (episode file names)

		// Base URL (directory of the index page) used to absolutise episode links.
		$url_re=getReUrl($url);

		// Album title: taken from the DOM first, with a regex fallback on the raw page.
		phpQuery::newDocumentCon($Pagecon);
		$textfile=mb_convert_encoding(pq("span.red12>strong")->text(),'gb2312','utf-8');
		if(empty($textfile)){
			$regex='/s=\"red12\"><strong>(.*?)<\/strong>/';
			preg_match($regex,$Pagecon,$_m);
			$textfile=isset($_m[1]) ? $_m[1] : '';
		}

		foreach($titles as $k=>$title){
			$play_url=$url_re.basename($hrefs[$k]);
			$input_text.=str_ireplace('.mp3','',$title).'$http://localhost/caiji/'.$textfile.'/'.$title.'$flv'."\r\n";
			// URL-encode both parameters so "&", "?", "#" or non-ASCII bytes in the
			// play URL / album title cannot corrupt the query string of getMp3.php.
			// NOTE(review): $textfile now reaches getMp3.php as the exact bytes
			// produced above - confirm the directory naming is still as expected.
			$query='url='.urlencode($play_url).'&textfile='.urlencode($textfile);
			$con.=<<<HTML
			<li><span>{$count}</span><a href="getMp3.php?{$query}" target="_blank">{$title}</a><span class="flag"></span></li>
		
HTML;
		$count++;
		}
		// NOTE(review): the textarea is emitted once per submitted page and holds
		// the lines accumulated so far; left unchanged.
		$con.='<textarea cols="1000" rows="500">'.$input_text.'</textarea>';
	}
	$con.='</ul>';
	echo $con;
?>
<script type="text/javascript">
	// Download driver: each click on #start requests the getMp3.php link of the
	// current list entry and re-triggers itself, walking through the list.
	$(function(){
				var eq=0;                        // index of the link to request next
		var links=$("ul#downlist>li>a"); // all episode links on the page
		var length=links.length;
		var time;                        // handle of the watchdog interval
		var i=0;                         // seconds counted by the watchdog
		$("#start").click(function(event) {
			// An explicit start index typed into #eq overrides the position once,
			// then the field is cleared.
			if($("#eq").val()!==""){
				eq=parseInt($("#eq").val());
				$("#eq").val('');
			}
			var url_arr=links.eq(eq);
			var url=url_arr.attr('href');
			var flag=url_arr.next(".flag");  // status cell next to the link
			flag.html('下载中');
			if(eq<length){
				clearInterval(time);
				$.get(url, function(data) {
					// If the response reports neither success nor "already exists",
					// step back so the same entry is requested again (eq was already
					// advanced synchronously below).
					if(data.indexOf("下载成功")==-1 && data.indexOf("已经存在")==-1){
						eq--;
					}
					flag.html(data);
					// Continue with the next request shortly after this one finished.
					setTimeout(function(){
						$("#start").click();
					},200);
					// Watchdog ticking once per second; after more than 30 ticks with
					// an empty response it skips ahead and triggers the next request.
					time=setInterval(function(){
						i++;
						if(i>30 && !data){
							eq++;
							$("#start").click();
							i=0;
						}	
					},1*1000);
				});
				
				// The 30 above means: move on to the next link automatically after 30 seconds.
				// Do not refresh the page; tune 30 to the bandwidth of your server.
			}
			// Advance to the following entry; runs immediately, before the response arrives.
			eq++;
			event.preventDefault();
			return false;
		});
	})
</script>


getMp3.php


<?php
	// Download endpoint: the status text is emitted as GB2312 and must not be cached.
	header("Content-type:text/html;charset=gb2312");
	header("Cache-Control: no-cache, must-revalidate");
	header("Pragma: no-cache");
	/**
	 * Download the resource at $url into memory using cURL.
	 *
	 * @param string $url Address of the file to fetch.
	 * @return string|false The response body, or false on failure (the cURL error is echoed).
	 */
	function DownFile($url){
		$handle = curl_init($url);
		curl_setopt_array($handle, array(
			CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
			CURLOPT_HEADER         => 0,    // body only, no response headers
			CURLOPT_RETURNTRANSFER => true, // hand the body back instead of printing it
		));
		$body = curl_exec($handle);
		if ($body === FALSE) {
			echo "cURL Error:".curl_error($handle);
		}
		curl_close($handle);
		return $body;
	}
	/**
	 * Fetch a URL with cURL (GET, or POST when $post_file is given) and return the response.
	 *
	 * @param string       $url            Address to request.
	 * @param string|array $post_file      POST payload; when empty a plain GET is issued.
	 * @param string       $cookie_file    File in which cURL stores received cookies (optional).
	 * @param mixed        $isgzip         Non-empty: let cURL decode gzip/deflate responses.
	 * @param bool         $is_show_header Whether the response headers are included in the result.
	 * @param string       $refer          Referer to send (also embedded in the player cookie);
	 *                                     defaults to $url itself.
	 * @param string       $ip             Address advertised via X-FORWARDED-FOR / CLIENT-IP.
	 * @return string|false Response text, or false on failure (the cURL error is echoed).
	 */
	function getPostCon($url,$post_file='',$cookie_file='',$isgzip='',$is_show_header=false,$refer='',$ip=''){
		$ch = curl_init($url);

		curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36');
		curl_setopt($ch, CURLOPT_HEADER, $is_show_header); // include response headers in the output when true
		if(!empty($cookie_file)){
			curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file); // persist received cookies
		}

		// The first entry previously lacked its "Accept:" field name, which made
		// it a malformed header line.
		$header = array(
			'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
			'Accept-Encoding:gzip, deflate',
			'Accept-Language:zh-CN,zh;q=0.8',
			'Cache-Control:no-cache',
			'Pragma:no-cache',
		);

		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // follow redirects
		if(!empty($isgzip)){
			curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate'); // transparently decode compressed bodies
		}
		if($ip){
			$header[] = 'X-FORWARDED-FOR:'.$ip;
			$header[] = 'CLIENT-IP:'.$ip;
		}
		if($refer){
			// NOTE(review): hard-coded session cookie; presumably copied from a
			// browser session of the target site - confirm it is still accepted.
			$header[] = 'Cookie:jieshaoren=0;  ASPSESSIONIDQSDTARCR=HPFGKHIAEPBKDHEIGDCNPHDA; t_play_mode=flash; t_play_url='.$refer;
			curl_setopt($ch, CURLOPT_REFERER, $refer);
		}else{
			// No explicit Referer: use the requested URL itself.
			curl_setopt($ch, CURLOPT_REFERER, $url);
		}
		// Applied last so that every header collected above is actually sent.
		curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

		if(!empty($post_file)){
			// Regular application/x-www-form-urlencoded POST, as sent by HTML forms.
			curl_setopt($ch, CURLOPT_POST, 1);
			curl_setopt($ch, CURLOPT_POSTFIELDS, $post_file);
		}
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the response instead of printing it
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);          // give up after 30 seconds

		$output = curl_exec($ch);
		if ($output === FALSE) {
			echo "cURL Error:".curl_error($ch);
		}
		curl_close($ch);
		return $output;
	}
	/**
	 * Reassemble a URL string from the component array produced by parse_url().
	 *
	 * Missing components are treated as empty strings, so inputs without a
	 * path (e.g. "http://host") no longer raise undefined-index warnings.
	 *
	 * @param array|false $url_arr Components: scheme, host, port, path, query, fragment.
	 * @return string The rebuilt URL.
	 */
	function http_build_url($url_arr){
		$scheme = isset($url_arr['scheme']) ? $url_arr['scheme'] : '';
		$host   = isset($url_arr['host'])   ? $url_arr['host']   : '';
		$path   = isset($url_arr['path'])   ? $url_arr['path']   : '';

		$new_url = $scheme . "://" . $host;
		if(!empty($url_arr['port']))
			$new_url = $new_url . ":" . $url_arr['port'];
		$new_url = $new_url . $path;
		if(!empty($url_arr['query']))
			$new_url = $new_url . "?" . $url_arr['query'];
		if(!empty($url_arr['fragment']))
			$new_url = $new_url . "#" . $url_arr['fragment'];
		return $new_url;
	}
	/**
	 * Return the "directory" form of a URL: the last path segment is removed
	 * and the path is terminated with "/". URLs without a path are rebuilt as-is.
	 *
	 * @param string $url Absolute URL, e.g. http://host/a/b/page.htm
	 * @return string URL rebuilt by http_build_url(), e.g. http://host/a/b/
	 */
	function getReUrl($url){
		$parts = parse_url($url);
		if(!empty($parts['path'])){
			$segments = explode('/', $parts['path']);
			// Drop the trailing segment (the file name) and close with a slash.
			$parts['path'] = implode('/', array_slice($segments, 0, -1)) . '/';
		}
		return http_build_url($parts);
	}
	/**
	 * Determine the client address from the request environment.
	 *
	 * The first non-empty value of HTTP_CLIENT_IP, HTTP_X_FORWARDED_FOR and
	 * REMOTE_ADDR (checked in that order) wins.
	 *
	 * @return string The address as found in $_SERVER, or "" when none is set.
	 */
	function GetIP(){
		$cip = "";
		foreach(array("HTTP_CLIENT_IP", "HTTP_X_FORWARDED_FOR", "REMOTE_ADDR") as $key){
			if(!empty($_SERVER[$key])){
				$cip = $_SERVER[$key];
				break;
			}
		}
		return $cip;
	}
	// ---- Download endpoint: resolve the real media URL of one episode and save it ----
	//
	// Input : $_GET['url']      play-page URL of the episode
	//         $_GET['textfile'] album title, used as the target directory name
	// Output: a plain-text status line. The collector page searches the response
	//         for the success / already-exists markers below, so these strings must
	//         stay in sync with its JavaScript.
	$ip=GetIP();
	include 'phpQuery/phpQuery.php';

	$msg_fail='下载失败';
	$msg_done='下载成功';
	$msg_exists='已经存在';

	$url=(isset($_GET['url']) && is_string($_GET['url'])) ? $_GET['url'] : '';
	$textfile=(isset($_GET['textfile']) && is_string($_GET['textfile'])) ? $_GET['textfile'] : '';

	// Only plain web URLs may be fetched: file_get_contents() would otherwise
	// happily read local files or other stream wrappers named by the client.
	$scheme=strtolower((string)parse_url($url,PHP_URL_SCHEME));
	if($scheme!=='http' && $scheme!=='https'){
		exit($msg_fail);
	}
	// The directory name comes from the client as well: refuse anything that
	// could leave the working directory (empty, "..", "/", NUL, leading "\",
	// or a drive prefix).
	if($textfile==='' || strpos($textfile,'..')!==false || strpos($textfile,'/')!==false
		|| strpos($textfile,"\0")!==false || $textfile[0]==='\\' || preg_match('/^[A-Za-z]:/',$textfile)){
		exit($msg_fail);
	}

	// Step 1: the play page embeds the player in an iframe named "playmedia".
	$con=file_get_contents($url);
	if($con===false){
		exit($msg_fail);
	}
	phpQuery::newDocumentCon($con);
	$src_temp=mb_convert_encoding(pq("iframe:[name=playmedia]")->attr('src'),'gb2312','utf-8');
	if(empty($src_temp)){
		// DOM lookup failed: fall back to a regex on the raw markup.
		$regex='/name=\"playmedia\".*?src=\"(.*?)\"/';
		preg_match($regex,$con,$m);
		$src_temp=isset($m[1]) ? $m[1] : '';
	}
	if(empty($src_temp)){
		exit($msg_fail);
	}

	// Step 2: the player page assigns the media URL parts as  [n]= "value";
	// entries 0 and 2 concatenated form the download URL.
	$src='http://www.tingchina.com'.$src_temp;
	$con=getPostCon($src,null,null,1,0,$url,$ip);
	$regex='/\[(\d)\]= \"(.*?)\"\;/';
	preg_match_all($regex,$con,$m);
	if(empty($m[2][0]) || empty($m[2][2])){
		exit($msg_fail);
	}
	$url=$m[2][0].$m[2][2];
	//$url=mb_convert_encoding($url,'gb2312','utf-8');

	// Step 3: derive the local file name from the URL path (query string removed).
	$url_arr=parse_url($url);
	if(!is_array($url_arr)){
		exit($msg_fail);
	}
	unset($url_arr['query']);
	$filename=basename(http_build_url($url_arr));
	if($filename===''){
		exit($msg_fail);
	}
	$file_path=$textfile.'/'.$filename;

	// Check for an existing file BEFORE downloading, so a finished episode is not
	// fetched again. A zero-byte file is treated as a failed earlier attempt and
	// gets replaced.
	if(file_exists($file_path) && filesize($file_path)>0){
		exit($file_path.$msg_exists);
	}
	if(!is_dir($textfile) && !mkdir($textfile,0777,true)){
		exit($msg_fail);
	}

	// Step 4: download and store. Nothing is written when the download failed,
	// otherwise an empty file would later be reported as "already exists".
	$con=DownFile($url);
	if(!is_string($con) || $con==='' || !file_put_contents($file_path,$con)){
		exit($msg_fail);
	}
	echo $file_path.$msg_done;
?>


这是下载地址

caiji1.rar


发表评论:


Powered by 海滨Blog
sitemap