抓取豆瓣电影数据以及分析数据程序

jackxiang 2008-9-12 16:28 | |
<?php
/*
from: 1292275
to:   2000000
* */
// Report every PHP error, warning and notice.
error_reporting(E_ALL);
/**
 * Imports locally saved Douban movie pages into MySQL.
 *
 * Every file found in $path is parsed with regular expressions (analyse),
 * normalised so that all expected columns are present (judge_return_info)
 * and stored as one row of $table (insert).
 *
 * Typical use: _setvalue($directory) followed by opendir_analyse_insert().
 */
class snatch
{
    // NOTE(review): connection settings and credentials are hard-coded in
    // source; consider loading them from configuration instead.
    var $db_host="10.210.128.197";
    var $db_port="3600";
    var $db_name="app4";
    var $user="app4";
    var $password="f40RB12cbE4a";
    var $table="movie_info";
    var $path="./"; // directory that is scanned for saved pages
    var $_db=null;  // lazily created PDO handle, reused by every insert()

    /**
     * Sets the directory to scan.
     *
     * @param string $path directory path; analyse() appends "/" itself
     */
    function _setvalue($path)
    {
        $this->path=$path;
    }

    /**
     * Makes sure every expected column exists in the parsed record.
     *
     * Missing (or NULL) fields are set to "" so the insert never lacks a
     * column, and array values are serialized so they fit one column.
     * The record is modified in place through the reference.
     *
     * @param array $all_film_info parsed record, modified in place
     * @return array names of the fields that had to be filled with ""
     *               (an empty array when nothing was missing)
     */
    function judge_return_info(&$all_film_info)
    {
        $all_film_fild=array("english_name","chinese_name","other_name","playwright","director","main_performer","play_year","language","country","imdb","introduce","play_img","tag");

        if(!is_array($all_film_info))
        {
            $all_film_info=array();
        }

        // Must start as an array: the caller join()s the result even when
        // no field was missing.
        $empty_filds=array();
        foreach($all_film_fild as $fileds)
        {
            if(!isset($all_film_info[$fileds]))
            {
                $all_film_info[$fileds]="";
                $empty_filds[]=$fileds;
            }
            elseif(is_array($all_film_info[$fileds]))
            {
                $all_film_info[$fileds]=serialize($all_film_info[$fileds]);
            }
        }
        return $empty_filds;
    }

    /**
     * Parses every regular file in $this->path, inserts the result and
     * echoes one status line per file.
     *
     * @return void
     */
    function opendir_analyse_insert()
    {
        $dir = opendir($this->path);
        if($dir === false)
        {
            // opendir() has already raised a warning; looping on an invalid
            // handle would never terminate cleanly.
            return;
        }

        while (($file = readdir($dir)) !== false)
        {
            if($file == "." || $file == "..")
            {
                continue;
            }
            if(!is_file($this->path."/".$file))
            {
                continue; // sub-directories cannot be parsed as pages
            }

            $all = $this->analyse($this->path,$file);
            $empty_filds = $this->judge_return_info($all);
            $flag=$this->insert($all);
            if($flag)
            {
                echo "对文件    ".$file."   的分析完毕,发现   ".join(",",$empty_filds)."   等分析为空,    插入数据库成功!\n";
            }else{
                echo "对文件    ".$file."   的分析完毕,发现    ".join(",",$empty_filds)."  等分析为空,    插入数据库失败!\n";
            }
        }
        closedir($dir);
    }

    /**
     * Extracts the movie fields from one saved page.
     *
     * Only fields that were actually found are present in the result;
     * judge_return_info() fills the gaps afterwards. Fields whose text
     * contains "/" are returned as arrays split on " /".
     *
     * @param string $path directory containing the file (no trailing slash)
     * @param string $file file name inside $path
     * @return array map of column name => string or array of strings
     */
    function analyse($path,$file)
    {
        $film_all_info=array();

        $file_info=$path."/".$file;
        $str_temp = file_get_contents($file_info);
        if($str_temp === false)
        {
            return $film_all_info;
        }
        $str_temp = str_replace("\n", "",$str_temp);

        // Text of the page's <h1>, taken with a capture group so that the
        // result does not depend on the letter case of the closing tag.
        $title=null;
        if(preg_match("/<h1>([^<]*)<\/h1>/i",$str_temp, $out5))
        {
            $title=$out5[1];
        }

        preg_match("/.*<span[^p]*pl2[^>]*>([^<]*)<\/span>/i", $str_temp, $out);
        if(empty($out))
        {
            // No separate Chinese-name span: the <h1> text is the Chinese name.
            if($title !== null)
            {
                $film_all_info["chinese_name"]=$title;
            }
        }
        else
        {
            // The span holds "label:name"; the <h1> text is the English name.
            $chinese_name_all = explode(":",$out[1]);
            if(isset($chinese_name_all[1]))
            {
                $film_all_info['chinese_name']=$chinese_name_all[1];
            }
            if($title !== null)
            {
                $film_all_info["english_name"]=$title;
            }
        }

        // Page labels and, at the same index, the column each one maps to.
        $movie_filed=array("编剧","导演","主演","上映年度","制片国家/地区","又名","imdb链接","语言");
        $dba_film_fild=array("playwright","director","main_performer","play_year","country","other_name","imdb","language");

        // $out3[1] holds the labels, $out3[2] the detail text after each label.
        preg_match_all("/<span[^>]*pl\"[^>]*>(.*?)<\/span>(.*?)<br\/>/i",$str_temp, $out3);

        $link_pattern = '/<a(.*?)href="(.*?)"(.*?)>(.*?)<\/a>/i';
        foreach($out3[1] as $key=>$value)
        {
            $raw_detail = $out3[2][$key];
            $trimmed = trim($value, ": <br>"); // strip the colon and stray markup characters

            // Index 0 is a valid hit, so compare against false explicitly.
            $pos = array_search($trimmed,$movie_filed,true);
            if($pos !== false && !empty($raw_detail))
            {
                if(strpos(strip_tags($raw_detail),"/") !== false)
                {
                    // Several values separated by " /": store them as a list.
                    $detail = trim($raw_detail, ":");
                    $detail = str_replace(":","",strip_tags($detail));
                    $film_all_info[$dba_film_fild[$pos]]=explode(" /",$detail);
                }
                else
                {
                    $film_all_info[$dba_film_fild[$pos]]=strip_tags($raw_detail);
                }
            }

            // IMDb code: prefer the text of a hyperlink. Look in the label
            // first and then in the detail text, and only overwrite the
            // value parsed above when a link was really found.
            $imdb_judge = strstr($value, 'imdb');
            if($imdb_judge !== false)
            {
                foreach(array($imdb_judge, $raw_detail) as $candidate)
                {
                    if(preg_match($link_pattern, $candidate, $m))
                    {
                        $film_all_info["imdb"] = $m[4];
                        break;
                    }
                }
            }
        }

        // Poster image name.
        if(preg_match("/otho\.douban\.com\/mpic\/([^\"]*)\.jpg\"/i",$str_temp, $out4))
        {
            $film_all_info["play_img"]=$out4[1];
        }

        // Plot summary.
        if(preg_match("/<div class=\"related_info\">.*?<div class=\"indent\">(.*?)<br\/>/i",$str_temp, $out6))
        {
            $film_all_info["introduce"]=$out6[1];
        }

        // Tags: the text of every link inside the tag section.
        if(preg_match("/<div class=\"indent\"><a href=\"\/movie\/tag\/.+<div class=\"obss\">/i",$str_temp, $out7))
        {
            preg_match_all($link_pattern, $out7[0], $tag);
            $film_all_info["tag"]=$tag[4];
        }

        return $film_all_info;
    }

    /**
     * Inserts one record into $this->table using a prepared statement.
     *
     * Array keys are used as column names; array values are serialized.
     *
     * @param array $all_film_info map of column name => value
     * @return int 1 when a row was inserted, 0 otherwise (including
     *             connection and query errors, which are logged)
     */
    function insert($all_film_info)
    {
        if(!is_array($all_film_info) || count($all_film_info) == 0)
        {
            return 0;
        }

        $columns=array();
        $placeholders=array();
        $values=array();
        foreach($all_film_info as $key=>$value)
        {
            $columns[]="`".str_replace("`","``",$key)."`";
            $placeholders[]="?";
            $values[]=is_array($value) ? serialize($value) : (string)$value;
        }
        $query="insert into `".$this->table."` (".join(",",$columns).") values (".join(",",$placeholders).")";

        try
        {
            $db = $this->_connect();
            $statement = $db->prepare($query);
            if($statement === false || !$statement->execute($values))
            {
                return 0;
            }
            return $statement->rowCount() > 0 ? 1 : 0;
        }
        catch(PDOException $e)
        {
            error_log("snatch::insert failed: ".$e->getMessage());
            return 0;
        }
    }

    /**
     * Returns the shared PDO connection, opening it on first use.
     *
     * @return PDO
     */
    function _connect()
    {
        if($this->_db === null)
        {
            $dsn="mysql:host=".$this->db_host.";port=".$this->db_port.";dbname=".$this->db_name;
            $this->_db = new PDO($dsn, $this->user, $this->password);
        }
        return $this->_db;
    }

    /**
     * Tells whether a page looks like a movie page, i.e. contains every
     * one of the expected section labels.
     *
     * @param string $content page text
     * @return int 1 when all labels are present, 0 otherwise
     */
    function judge_is_film($content)
    {
        $content=(string)$content;
        $required=array("导演",'编剧','上映年度','语言','剧情简介');
        foreach($required as $marker)
        {
            if(strpos($content,$marker) === false)
            {
                return 0;
            }
        }
        return 1;
    }
}
?>
<?php
  $path="/home/xiangdong2/snatch/test"; //test后面不能有/号
  $snatch=new snatch();
  $snatch->_setvalue($path);
  $movie_info = $snatch->opendir_analyse_insert();

?>

作者:jackxiang@向东博客 专注WEB应用 构架之美 --- 构架之美,在于尽态极妍 | 应用之美,在于药到病除
地址:http://jackxiang.com/post/1253/
版权所有。转载时必须以链接形式注明作者和原始出处及本声明!


最后编辑: jackxiang 编辑于2010-9-13 15:16
评论列表
2011-5-17 02:00 | duanjianair
你好,看了你的文章我有一些问题想请教你,如果方便的话能交流一下吗?我的邮箱是:henansky@126.com,谢谢
分页: 1/1 第一页 1 最后页
发表评论

昵称

网址

电邮

打开HTML 打开UBB 打开表情 隐藏 记住我 [登入] [注册]