<?php
/*
from:1292275
to: 2000000
* */
// Report every notice, warning and error while the script runs.
error_reporting(E_ALL);
/**
 * Imports saved movie HTML pages from a directory into a MySQL table.
 *
 * Each file in $path is parsed with regular expressions (analyse), the
 * resulting record is padded/normalised (judge_return_info) and written to
 * $table through PDO (insert).
 */
class snatch
{
    // NOTE(review): connection settings, including the password, are
    // hard-coded in source; consider loading them from configuration.
    public $db_host = "10.210.128.197";
    public $db_port = "3600";
    public $db_name = "app4";
    public $user = "app4";
    public $password = "f40RB12cbE4a";
    public $table = "movie_info";
    public $path = "./"; // directory that is scanned for saved pages

    /** Lazily created PDO handle, reused by every insert() call. */
    private $_db = null;

    /**
     * Sets the directory that opendir_analyse_insert() scans.
     *
     * @param string $path directory path
     */
    function _setvalue($path)
    {
        $this->path = $path;
    }

    /**
     * Normalises a parsed record in place so that every table column is present.
     *
     * Missing or null fields are set to "" and array values are serialize()d.
     *
     * @param array $all_film_info record produced by analyse(); modified by reference
     * @return array names of the fields that were missing (empty array when none were)
     */
    function judge_return_info(&$all_film_info)
    {
        $all_film_fild = array(
            "english_name", "chinese_name", "other_name", "playwright",
            "director", "main_performer", "play_year", "language",
            "country", "imdb", "introduce", "play_img", "tag",
        );
        // Must start as an array: the caller join()s the result even when
        // nothing was missing.
        $empty_filds = array();
        foreach ($all_film_fild as $fileds) {
            if (!isset($all_film_info[$fileds])) {
                $all_film_info[$fileds] = "";
                $empty_filds[] = $fileds;
            }
            if (is_array($all_film_info[$fileds])) {
                $all_film_info[$fileds] = serialize($all_film_info[$fileds]);
            }
        }
        return $empty_filds;
    }

    /**
     * Parses every regular file in $this->path, inserts it and echoes a
     * one-line report per file.
     */
    function opendir_analyse_insert()
    {
        $dir = opendir($this->path);
        if ($dir === false) {
            echo "无法打开目录 ".$this->path."\n";
            return;
        }
        while (($file = readdir($dir)) !== false) {
            if ($file === "." || $file === "..") {
                continue;
            }
            // Sub-directories cannot be parsed and would only yield blank rows.
            if (!is_file(rtrim($this->path, "/")."/".$file)) {
                continue;
            }
            $all = $this->analyse($this->path, $file);
            $empty_filds = $this->judge_return_info($all);
            $flag = $this->insert($all);
            echo "对文件 ".$file." 的分析完毕,发现 ".join(",", $empty_filds)
                ." 等分析为空, 插入数据库".($flag ? "成功" : "失败")."!\n";
        }
        closedir($dir);
    }

    /**
     * Extracts the movie fields from one saved HTML page.
     *
     * @param string $path directory containing the file (a trailing "/" is tolerated)
     * @param string $file file name inside $path
     * @return array map of column name => string|array|null; fields that
     *               could not be located are null or absent
     */
    function analyse($path, $file)
    {
        $film_all_info = array();
        $html = file_get_contents(rtrim($path, "/")."/".$file);
        if ($html === false) {
            $html = ""; // unreadable file: fall through and report every field as empty
        }
        $html = str_replace("\n", "", $html);

        $title = $this->_extract_title($html);
        preg_match("/.*<span[^p]*pl2[^>]*>([^<]*)<\/span>/i", $html, $out);
        if (empty($out)) {
            // No separate Chinese-name span: the <h1> text is the Chinese name.
            $film_all_info["chinese_name"] = $title;
        } else {
            // The span holds "label:name"; the <h1> text is then the English name.
            $name_parts = explode(":", $out[1]);
            $film_all_info["chinese_name"] = isset($name_parts[1]) ? $name_parts[1] : null;
            $film_all_info["english_name"] = $title;
        }

        // Page label => table column. A keyed lookup (instead of array_search
        // compared against NULL) keeps the first entry reachable.
        $label_to_column = array(
            "编剧" => "playwright",
            "导演" => "director",
            "主演" => "main_performer",
            "上映年度" => "play_year",
            "制片国家/地区" => "country",
            "又名" => "other_name",
            "imdb链接" => "imdb",
            "语言" => "language",
        );
        preg_match_all("/<span[^>]*pl\"[^>]*>(.*?)<\/span>(.*?)<br\/>/i", $html, $out3);
        foreach ($out3[1] as $key => $value) {
            // NOTE(review): the original trimmed ":" a second time, which is a
            // no-op after this call; it may have been meant for a full-width
            // colon - confirm against real pages.
            $label = trim($value, ": <br>");
            if (isset($label_to_column[$label]) && !empty($out3[2][$key])) {
                $column = $label_to_column[$label];
                $text = strip_tags($out3[2][$key]);
                if (strpos($text, "/") !== false) {
                    // Multi-valued field: drop every ":" and split on " /".
                    $film_all_info[$column] = explode(" /", str_replace(":", "", $text));
                } else {
                    $film_all_info[$column] = $text;
                }
            }
            // IMDb id: when the label itself carries a link, its text wins.
            // Only overwrite when a link was actually found.
            $imdb_judge = strstr($value, 'imdb');
            if ($imdb_judge !== false) {
                $link_texts = $this->_link_texts($imdb_judge);
                if (isset($link_texts[0])) {
                    $film_all_info["imdb"] = $link_texts[0];
                }
            }
        }

        // Poster image id.
        preg_match("/otho\.douban\.com\/mpic\/([^\"]*)\.jpg\"/i", $html, $out4);
        $film_all_info["play_img"] = isset($out4[1]) ? $out4[1] : null;

        // Plot summary.
        preg_match("/<div class=\"related_info\">.*?<div class=\"indent\">(.*?)<br\/>/i", $html, $out6);
        $film_all_info["introduce"] = isset($out6[1]) ? $out6[1] : null;

        // Tags: the text of every link inside the tag block.
        preg_match("/<div class=\"indent\"><a href=\"\/movie\/tag\/.+<div class=\"obss\">/i", $html, $out7);
        if (!empty($out7)) {
            $film_all_info["tag"] = $this->_link_texts($out7[0]);
        }
        return $film_all_info;
    }

    /**
     * Inserts one record into $this->table using a prepared statement.
     *
     * @param array $all_film_info map of column name => value
     * @return int 1 on success, 0 on failure or empty input
     */
    function insert($all_film_info)
    {
        if (empty($all_film_info)) {
            return 0;
        }
        $columns = array();
        $values = array();
        foreach ($all_film_info as $key => $value) {
            $columns[] = "`".str_replace("`", "``", $key)."`";
            // Cast so that null is stored as '' (as the old escaping did).
            $values[] = is_array($value) ? serialize($value) : (string)$value;
        }
        $placeholders = join(",", array_fill(0, count($values), "?"));
        $query = "insert into `".$this->table."` (".join(",", $columns).") values (".$placeholders.")";
        try {
            $statement = $this->_connect()->prepare($query);
            return $statement->execute($values) ? 1 : 0;
        } catch (PDOException $e) {
            error_log("snatch::insert failed: ".$e->getMessage());
            return 0;
        }
    }

    /**
     * Tells whether a page looks like a movie page, i.e. contains all of the
     * expected section labels.
     *
     * @param string $content page text
     * @return int 1 when every label is present, otherwise 0
     */
    function judge_is_film($content)
    {
        $markers = array("导演", "编剧", "上映年度", "语言", "剧情简介");
        foreach ($markers as $marker) {
            if (strpos($content, $marker) === false) {
                return 0;
            }
        }
        return 1;
    }

    /** Returns the shared PDO connection, opening it on first use. */
    private function _connect()
    {
        if ($this->_db === null) {
            $dsn = "mysql:host=".$this->db_host.";port=".$this->db_port.";dbname=".$this->db_name;
            $this->_db = new PDO($dsn, $this->user, $this->password);
            $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        }
        return $this->_db;
    }

    /** Returns the text of the first <h1> element, or null when there is none. */
    private function _extract_title($html)
    {
        if (preg_match("/<h1>([^<]*)<\/h1>/i", $html, $match)) {
            return $match[1];
        }
        return null;
    }

    /** Returns the inner text of every <a href="..."> link found in $html. */
    private function _link_texts($html)
    {
        preg_match_all('/<a(.*?)href="(.*?)"(.*?)>(.*?)<\/a>/i', $html, $match);
        return $match[4];
    }
}
?>
<?php
// Directory holding the saved pages (the original author notes it should not end with "/").
$path = "/home/xiangdong2/snatch/test";

// Build the importer, point it at the directory and run the full
// scan -> parse -> insert pass.
$snatch = new snatch();
$snatch->_setvalue($path);
$movie_info = $snatch->opendir_analyse_insert();
?>
/*
from:1292275
to: 2000000
* */
// Report every notice, warning and error while the script runs.
error_reporting(E_ALL);
/**
 * Imports saved movie HTML pages from a directory into a MySQL table.
 *
 * Each file in $path is parsed with regular expressions (analyse), the
 * resulting record is padded/normalised (judge_return_info) and written to
 * $table through PDO (insert).
 */
class snatch
{
    // NOTE(review): connection settings, including the password, are
    // hard-coded in source; consider loading them from configuration.
    public $db_host = "10.210.128.197";
    public $db_port = "3600";
    public $db_name = "app4";
    public $user = "app4";
    public $password = "f40RB12cbE4a";
    public $table = "movie_info";
    public $path = "./"; // directory that is scanned for saved pages

    /** Lazily created PDO handle, reused by every insert() call. */
    private $_db = null;

    /**
     * Sets the directory that opendir_analyse_insert() scans.
     *
     * @param string $path directory path
     */
    function _setvalue($path)
    {
        $this->path = $path;
    }

    /**
     * Normalises a parsed record in place so that every table column is present.
     *
     * Missing or null fields are set to "" and array values are serialize()d.
     *
     * @param array $all_film_info record produced by analyse(); modified by reference
     * @return array names of the fields that were missing (empty array when none were)
     */
    function judge_return_info(&$all_film_info)
    {
        $all_film_fild = array(
            "english_name", "chinese_name", "other_name", "playwright",
            "director", "main_performer", "play_year", "language",
            "country", "imdb", "introduce", "play_img", "tag",
        );
        // Must start as an array: the caller join()s the result even when
        // nothing was missing.
        $empty_filds = array();
        foreach ($all_film_fild as $fileds) {
            if (!isset($all_film_info[$fileds])) {
                $all_film_info[$fileds] = "";
                $empty_filds[] = $fileds;
            }
            if (is_array($all_film_info[$fileds])) {
                $all_film_info[$fileds] = serialize($all_film_info[$fileds]);
            }
        }
        return $empty_filds;
    }

    /**
     * Parses every regular file in $this->path, inserts it and echoes a
     * one-line report per file.
     */
    function opendir_analyse_insert()
    {
        $dir = opendir($this->path);
        if ($dir === false) {
            echo "无法打开目录 ".$this->path."\n";
            return;
        }
        while (($file = readdir($dir)) !== false) {
            if ($file === "." || $file === "..") {
                continue;
            }
            // Sub-directories cannot be parsed and would only yield blank rows.
            if (!is_file(rtrim($this->path, "/")."/".$file)) {
                continue;
            }
            $all = $this->analyse($this->path, $file);
            $empty_filds = $this->judge_return_info($all);
            $flag = $this->insert($all);
            echo "对文件 ".$file." 的分析完毕,发现 ".join(",", $empty_filds)
                ." 等分析为空, 插入数据库".($flag ? "成功" : "失败")."!\n";
        }
        closedir($dir);
    }

    /**
     * Extracts the movie fields from one saved HTML page.
     *
     * @param string $path directory containing the file (a trailing "/" is tolerated)
     * @param string $file file name inside $path
     * @return array map of column name => string|array|null; fields that
     *               could not be located are null or absent
     */
    function analyse($path, $file)
    {
        $film_all_info = array();
        $html = file_get_contents(rtrim($path, "/")."/".$file);
        if ($html === false) {
            $html = ""; // unreadable file: fall through and report every field as empty
        }
        $html = str_replace("\n", "", $html);

        $title = $this->_extract_title($html);
        preg_match("/.*<span[^p]*pl2[^>]*>([^<]*)<\/span>/i", $html, $out);
        if (empty($out)) {
            // No separate Chinese-name span: the <h1> text is the Chinese name.
            $film_all_info["chinese_name"] = $title;
        } else {
            // The span holds "label:name"; the <h1> text is then the English name.
            $name_parts = explode(":", $out[1]);
            $film_all_info["chinese_name"] = isset($name_parts[1]) ? $name_parts[1] : null;
            $film_all_info["english_name"] = $title;
        }

        // Page label => table column. A keyed lookup (instead of array_search
        // compared against NULL) keeps the first entry reachable.
        $label_to_column = array(
            "编剧" => "playwright",
            "导演" => "director",
            "主演" => "main_performer",
            "上映年度" => "play_year",
            "制片国家/地区" => "country",
            "又名" => "other_name",
            "imdb链接" => "imdb",
            "语言" => "language",
        );
        preg_match_all("/<span[^>]*pl\"[^>]*>(.*?)<\/span>(.*?)<br\/>/i", $html, $out3);
        foreach ($out3[1] as $key => $value) {
            // NOTE(review): the original trimmed ":" a second time, which is a
            // no-op after this call; it may have been meant for a full-width
            // colon - confirm against real pages.
            $label = trim($value, ": <br>");
            if (isset($label_to_column[$label]) && !empty($out3[2][$key])) {
                $column = $label_to_column[$label];
                $text = strip_tags($out3[2][$key]);
                if (strpos($text, "/") !== false) {
                    // Multi-valued field: drop every ":" and split on " /".
                    $film_all_info[$column] = explode(" /", str_replace(":", "", $text));
                } else {
                    $film_all_info[$column] = $text;
                }
            }
            // IMDb id: when the label itself carries a link, its text wins.
            // Only overwrite when a link was actually found.
            $imdb_judge = strstr($value, 'imdb');
            if ($imdb_judge !== false) {
                $link_texts = $this->_link_texts($imdb_judge);
                if (isset($link_texts[0])) {
                    $film_all_info["imdb"] = $link_texts[0];
                }
            }
        }

        // Poster image id.
        preg_match("/otho\.douban\.com\/mpic\/([^\"]*)\.jpg\"/i", $html, $out4);
        $film_all_info["play_img"] = isset($out4[1]) ? $out4[1] : null;

        // Plot summary.
        preg_match("/<div class=\"related_info\">.*?<div class=\"indent\">(.*?)<br\/>/i", $html, $out6);
        $film_all_info["introduce"] = isset($out6[1]) ? $out6[1] : null;

        // Tags: the text of every link inside the tag block.
        preg_match("/<div class=\"indent\"><a href=\"\/movie\/tag\/.+<div class=\"obss\">/i", $html, $out7);
        if (!empty($out7)) {
            $film_all_info["tag"] = $this->_link_texts($out7[0]);
        }
        return $film_all_info;
    }

    /**
     * Inserts one record into $this->table using a prepared statement.
     *
     * @param array $all_film_info map of column name => value
     * @return int 1 on success, 0 on failure or empty input
     */
    function insert($all_film_info)
    {
        if (empty($all_film_info)) {
            return 0;
        }
        $columns = array();
        $values = array();
        foreach ($all_film_info as $key => $value) {
            $columns[] = "`".str_replace("`", "``", $key)."`";
            // Cast so that null is stored as '' (as the old escaping did).
            $values[] = is_array($value) ? serialize($value) : (string)$value;
        }
        $placeholders = join(",", array_fill(0, count($values), "?"));
        $query = "insert into `".$this->table."` (".join(",", $columns).") values (".$placeholders.")";
        try {
            $statement = $this->_connect()->prepare($query);
            return $statement->execute($values) ? 1 : 0;
        } catch (PDOException $e) {
            error_log("snatch::insert failed: ".$e->getMessage());
            return 0;
        }
    }

    /**
     * Tells whether a page looks like a movie page, i.e. contains all of the
     * expected section labels.
     *
     * @param string $content page text
     * @return int 1 when every label is present, otherwise 0
     */
    function judge_is_film($content)
    {
        $markers = array("导演", "编剧", "上映年度", "语言", "剧情简介");
        foreach ($markers as $marker) {
            if (strpos($content, $marker) === false) {
                return 0;
            }
        }
        return 1;
    }

    /** Returns the shared PDO connection, opening it on first use. */
    private function _connect()
    {
        if ($this->_db === null) {
            $dsn = "mysql:host=".$this->db_host.";port=".$this->db_port.";dbname=".$this->db_name;
            $this->_db = new PDO($dsn, $this->user, $this->password);
            $this->_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
        }
        return $this->_db;
    }

    /** Returns the text of the first <h1> element, or null when there is none. */
    private function _extract_title($html)
    {
        if (preg_match("/<h1>([^<]*)<\/h1>/i", $html, $match)) {
            return $match[1];
        }
        return null;
    }

    /** Returns the inner text of every <a href="..."> link found in $html. */
    private function _link_texts($html)
    {
        preg_match_all('/<a(.*?)href="(.*?)"(.*?)>(.*?)<\/a>/i', $html, $match);
        return $match[4];
    }
}
?>
<?php
// Directory holding the saved pages (the original author notes it should not end with "/").
$path = "/home/xiangdong2/snatch/test";

// Build the importer, point it at the directory and run the full
// scan -> parse -> insert pass.
$snatch = new snatch();
$snatch->_setvalue($path);
$movie_info = $snatch->opendir_analyse_insert();
?>
作者:jackxiang@向东博客 专注WEB应用 构架之美 --- 构架之美,在于尽态极妍 | 应用之美,在于药到病除
地址:http://jackxiang.com/post/1253/
版权所有。转载时必须以链接形式注明作者和原始出处及本声明!
最后编辑: jackxiang 编辑于2010-9-13 15:16
评论列表
2011-5-17 02:00 | duanjianair
你好,看了你的文章我有一些问题想请教你,如果方便的话能交流一下吗?我的邮箱是:henansky@126.com,谢谢
分页: 1/1
1

