基于php编写的新闻类爬虫，插入WordPress数据库

这个爬虫写的比较久远，很久没有更新博客了。

1.首先思路是：通过php的curl_setopt()函数可以方便快捷的抓取网页。

2.什么样的新闻吸引人呢？当然是热点新闻了。这里选用百度的搜索风云榜，获取热点关键词列表。

3.为了方便过滤，我们只筛选搜狐的新闻。由于搜狐的新闻可以通过搜狗搜索到，所以把百度热点关键词逐个放到搜狗里搜索，打开对应的结果，从中筛选出搜狐的新闻链接。

4.进入搜狐新闻。获取新闻数据，进行内容筛选，重复过滤。

5.插入WordPress数据库，得到自己的新闻链接

6.自己的新闻链接主动提交给百度收录。

spider.class.php

 <?php

 //网页爬虫

 /**
  * Minimal page fetcher: downloads one URL with cURL and normalises the body to UTF-8.
  *
  * A constructor cannot return a value, so failure is reported through $data:
  * after construction it holds the UTF-8 page body, or the falsy value that
  * curl_exec() produced when the request failed. Callers must check $data.
  */
 class spider{

     // cURL handle; left open here so that subclasses can close it in their destructors.
     public $curl;

     // Connect timeout in seconds (used for CURLOPT_CONNECTTIMEOUT).
     public $timeout = 5;

     // Page body converted to UTF-8, or the falsy curl_exec() result on failure.
     public $data;

     // URL the page was requested from.
     public $fromUrl;

     /**
      * Fetch $url immediately and store the (UTF-8) body in $this->data.
      *
      * @param string $url Absolute URL to download.
      */
     public function __construct($url) {
         $this -> fromUrl = $url;
         $this -> curl = curl_init();

         // Empty string = advertise every encoding cURL supports and decode the
         // response transparently (avoids garbled gzip/deflate bodies).
         curl_setopt($this -> curl, CURLOPT_ENCODING, "");
         curl_setopt($this -> curl, CURLOPT_URL, $url);
         // Do not include response headers in the returned body.
         curl_setopt($this -> curl, CURLOPT_HEADER, 0);
         // Return the body as a string instead of printing it.
         curl_setopt($this -> curl, CURLOPT_RETURNTRANSFER, 1);
         curl_setopt($this -> curl, CURLOPT_CONNECTTIMEOUT, $this -> timeout);
         // Present a desktop browser user agent to the remote site.
         curl_setopt($this -> curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36');

         $this -> data = curl_exec($this -> curl);

         if ($this -> data) {
             $this -> data = self::toUtf8($this -> data);
         }
     }

     /**
      * Convert an HTML document to UTF-8 based on the charset declared in its <meta> tag.
      *
      * The document is returned unchanged when no charset is declared, when it
      * already declares UTF-8 (compared case-insensitively), or when iconv cannot
      * perform the conversion.
      *
      * @param string $html Raw HTML as received from the server.
      * @return string HTML encoded as UTF-8 where conversion was possible.
      */
     public static function toUtf8($html) {
         if (!preg_match("/<meta.+?charset=[^\w]?([-\w]+)/i", $html, $found)) {
             return $html;
         }

         $charset = strtolower($found[1]);
         if ($charset === 'utf-8') {
             return $html;
         }

         // //IGNORE drops byte sequences that cannot be represented in UTF-8.
         $converted = iconv($charset, "utf-8//IGNORE", $html);

         // Keep the original bytes rather than replacing the page with FALSE.
         return ($converted === false) ? $html : $converted;
     }

 }

 ?>

mysql.class.php

 <?php

 //数据库操作

 // Bootstrap executed on include: announce UTF-8 output, open the shared
 // MySQL connection used implicitly by every mysql_* call in mysqlJi, and
 // select the WordPress database.
 header('Content-Type: text/html; charset=UTF-8');

 $conn = mysql_connect("10.10.10.10","","");

 // Verify the connection BEFORE issuing any command on it.
 if (!$conn){
     die('数据库连接失败: ' . mysql_error());
 }

 if (!mysql_select_db('cn_bjxxw_wenzhang', $conn)){
     die('数据库连接失败: ' . mysql_error($conn));
 }

 // mysql_set_charset() also informs the client library of the charset, which
 // mysql_real_escape_string() relies on; a bare "SET NAMES" query does not.
 mysql_set_charset('utf8', $conn);

 /**
  * Thin static helpers around the legacy mysql_* API.
  *
  * Every method operates on the most recently opened connection (the one
  * created when this file is included).
  */
 class mysqlJi{

     /**
      * Run a SQL statement.
      *
      * @param string $sql Complete SQL text; the caller is responsible for escaping values.
      * @return resource|bool Whatever mysql_query() returns.
      */
     static function mysqlQu($sql){
         return mysql_query($sql);
     }

     /**
      * Collect every row of a result set as a list of associative arrays.
      *
      * @param resource|bool $result Value returned by mysqlQu().
      * @return array Zero-indexed list of rows; empty when the query failed or matched nothing.
      */
     static function mysqlFeAs($result){
         $reslist = array();

         // A failed query yields FALSE, which must not be passed to mysql_fetch_assoc().
         if (!$result) {
             return $reslist;
         }

         while ($row = mysql_fetch_assoc($result)) {
             $reslist[] = $row;
         }

         return $reslist;
     }

     /**
      * Print a value as JSON.
      *
      * @param mixed $reslist Data to encode.
      */
     static function jsonEn($reslist){
         echo json_encode($reslist);
     }

     /**
      * Number of rows touched by the last statement.
      *
      * Returns mysql_affected_rows() when it is at least 1; otherwise falls back
      * to the "Rows matched" figure from mysql_info(), or 0 if that is absent.
      *
      * @return int
      */
     static function mysqlRows(){
         $a_rows = mysql_affected_rows();

         if ($a_rows >= 1) {
             return $a_rows;
         }

         return self::parseMatchedRows(mysql_info());
     }

     /**
      * Extract the "Rows matched: N" counter from a mysql_info() string.
      *
      * @param string|bool $info Return value of mysql_info() (FALSE when unavailable).
      * @return int N, or 0 when the counter is not present.
      */
     static function parseMatchedRows($info){
         if (is_string($info) && preg_match('/Rows matched: ([0-9]+)/', $info, $r_matched)) {
             return (int) $r_matched[1];
         }

         return 0;
     }

     /**
      * AUTO_INCREMENT id generated by the last INSERT.
      *
      * @return int Whatever mysql_insert_id() returns.
      */
     static function insertId(){
         return mysql_insert_id();
     }

     /**
      * Close the shared database connection.
      */
     static function mysqlCl(){
         mysql_close();
     }

 }

 ?>

baiduPush.class.php

 <?php

 //百度收录主动提交

 /**
  * Submits a batch of URLs to a Baidu link-submission endpoint.
  *
  * The request is sent from the constructor; inspect $result (and $error)
  * afterwards.
  */
 class baiduPush{

     // Raw response body, or FALSE when the request failed.
     public $result;

     // cURL error message for a failed request; empty string on success.
     public $error = '';

     /**
      * POST the URLs, one per line, as text/plain to $api.
      *
      * @param array|string $urls URLs to submit (a single URL string is accepted too).
      * @param string       $api  Full endpoint URL including the site and token parameters.
      */
     public function __construct($urls,$api){
         $ch = curl_init();

         $options = array(
             CURLOPT_URL => $api,
             CURLOPT_POST => true,
             CURLOPT_RETURNTRANSFER => true,
             CURLOPT_POSTFIELDS => implode("\n", (array) $urls),
             CURLOPT_HTTPHEADER => array('Content-Type: text/plain'),
             // Bound the request so that a stalled endpoint cannot hang the crawl.
             CURLOPT_CONNECTTIMEOUT => 5,
             CURLOPT_TIMEOUT => 15,
         );
         curl_setopt_array($ch, $options);

         $this->result = curl_exec($ch);
         if ($this->result === false) {
             $this->error = curl_error($ch);
         }

         // Release the handle; it is not needed after the single request.
         curl_close($ch);
     }

 }

 ?>

info.php

 <?php

 //本页为获取新闻详情页面

 include ("mysql.class.php");//引入数据库相关

 date_default_timezone_set("PRC");//时区

 include ("spider.class.php");

 include("baiduPush.class.php");

 /**
  * Article-page scraper: extracts title, body and source from a fetched Sohu
  * news page and stores it as a WordPress post.
  */
 class spider_cont extends spider{

     // preg_match() result for the headline; index 0 holds the tag-stripped title.
     public $title = array();

     // Declared for the publication time; not written by the code in this class.
     public $stime = array();

     // preg_match() result for the article body; index 0 holds the filtered HTML.
     public $screenData = array();

     // Declared for the news category; not written by the code in this class.
     public $classNew;

     /**
      * Parse the fetched Sohu page and, when it yields a usable article, save it.
      *
      * Paginated articles (pages containing "top-pager-current") are skipped.
      * Articles whose source is "新京报" are not stored.
      *
      * @param int|string $fenlei term_taxonomy_id to attach the new post to.
      */
     function sohuCon($fenlei){
         // A failed download leaves $this->data falsy; treat it as an empty page.
         $html = (string) $this->data;

         if (preg_match('/top\-pager\-current/', $html)) {
             print "------数据放空";
             return;
         }

         $h1Pattern = '/<h1(.*?)>(.*?)<\/h1>/si';

         // Try the known body containers in order of preference.
         $this->screenData = $this->matchOrEmpty('/<div[^>]*itemprop="articleBody"[^>]*>(.*?) seo/si', $html);
         if ($this->screenData[0] == '') {
             $this->screenData = $this->matchOrEmpty('/<div[^>]*id="contentText"[^>]*>(.*?) seo/si', $html);
         }

         if ($this->screenData[0] != '') {
             $this->title = $this->matchOrEmpty($h1Pattern, $html);
         } else {
             // Last-resort layout: body ends at " -->" and the headline is the SECOND <h1>.
             $this->screenData = $this->matchOrEmpty('/<div[^>]*id="contentText"[^>]*>(.*?) -->/si', $html);
             preg_match_all($h1Pattern, $html, $tit2);
             $this->title[0] = isset($tit2[0][1]) ? $tit2[0][1] : '';
         }

         // Remove media_span_url('...') calls, then strip every tag except the layout ones.
         $body = preg_replace("/media_span_url\(\'(.*?)\'\)/si", "", $this->screenData[0]);
         $this->screenData[0] = $this->guolv($body, "<img><div><span><p>");
         $this->title[0] = $this->guolv($this->title[0], "");

         $laiyuan = $this->matchOrEmpty('/<span itemprop="name">(.*?)<\/span>/si', $html);
         $g_laiyuan = $this->guolv($laiyuan[0], "");

         if ($this->screenData[0] && $this->title[0] && $g_laiyuan != "新京报") {
             // Four-digit year so the value is an unambiguous DATETIME literal.
             $atime = date('Y-m-d H:i:s', time());
             $this->sqlDo($this->title[0], $atime, $this->screenData[0], $g_laiyuan, $this->fromUrl, $fenlei);
         } else {
             echo $this->title[0].$g_laiyuan."-----数据为空<br>";
         }
     }

     /**
      * Run preg_match() and always return an array that has index 0.
      *
      * @param string $pattern PCRE pattern.
      * @param string $subject Text to search.
      * @return array The match array, or array('') when nothing matched.
      */
     private function matchOrEmpty($pattern, $subject){
         if (preg_match($pattern, $subject, $found)) {
             return $found;
         }
         return array(0 => '');
     }

     /**
      * Insert the article into wp_posts unless its source URL is already stored,
      * push the new permalink to Baidu, and attach the post to a category.
      *
      * @param string     $atitle   Post title.
      * @param string     $atime    Local publication time, "Y-m-d H:i:s".
      * @param string     $acontent Post body HTML.
      * @param string     $fromName Name of the originating publisher.
      * @param string     $biaoshi  Source URL, stored in post_content_filtered as the de-duplication key.
      * @param int|string $fenlei   term_taxonomy_id of the category.
      */
     function sqlDo($atitle,$atime,$acontent,$fromName,$biaoshi,$fenlei){
         // The values come from scraped pages: escape all of them before embedding in SQL.
         $qTitle   = mysql_real_escape_string($atitle);
         $qContent = mysql_real_escape_string($acontent);
         $qFrom    = mysql_real_escape_string($fromName);
         $qBiaoshi = mysql_real_escape_string($biaoshi);
         $qTime    = mysql_real_escape_string($atime);
         $fenleiId = (int) $fenlei;

         // The *_gmt columns must hold UTC, not the local (PRC) time.
         $stamp    = strtotime($atime);
         $qTimeGmt = ($stamp === false) ? $qTime : gmdate('Y-m-d H:i:s', $stamp);

         $getBiaoshi = mysqlJi::mysqlQu("SELECT COUNT(*) as biaoshi FROM `wp_posts` WHERE post_content_filtered='{$qBiaoshi}';");
         $acount = mysqlJi::mysqlFeAs($getBiaoshi);
         $existing = isset($acount[0]['biaoshi']) ? (int) $acount[0]['biaoshi'] : 0;

         if ($existing != 0) {
             echo "--数据库已经有啦--";
             return;
         }

         $res = mysqlJi::mysqlQu(
             "INSERT INTO `wp_posts` (`ID`, `post_author`, `post_date`, `post_date_gmt`, `post_content`, `post_title`, `post_excerpt`, `post_status`, `comment_status`, `ping_status`, `post_password`, `post_name`, `to_ping`, `pinged`, `post_modified`, `post_modified_gmt`, `post_content_filtered`, `post_parent`, `guid`, `menu_order`, `post_type`, `post_mime_type`, `comment_count`, `laiyuan`, `description`) "
             . "VALUES ('', '9192206', '{$qTime}', '{$qTimeGmt}', '{$qContent}', '{$qTitle}', '', 'publish', 'open', 'open', '', '%e6%9e%81%e5%8c%96%e6%b3%a2', '', '', '{$qTime}', '{$qTimeGmt}', '{$qBiaoshi}', '0', '', '0', 'post', '', '0', '{$qFrom}', '')"
         );
         $rows = mysqlJi::mysqlRows();

         if ($rows != 1) {
             echo $atitle."----来源".$fromName."----失败<br>";
             return;
         }

         $inid = mysqlJi::insertId();

         // Announce the new permalink to Baidu.
         // NOTE(review): the API token is hard-coded in source; consider moving it to configuration.
         $urls = array('http://wenzhang.bjxxw.com/archives/'.$inid.'.html');
         $baiduApi = 'http://data.zz.baidu.com/urls?site=wenzhang.bjxxw.com&token=GKEJ4ENj4i6PMu51';
         $aBaiduPush = new baiduPush($urls, $baiduApi);
         echo $aBaiduPush->result;

         // Attach the post to its category.
         mysqlJi::mysqlQu("INSERT INTO `wp_term_relationships` (`object_id`, `term_taxonomy_id`) VALUES ('{$inid}', '{$fenleiId}')");

         echo $atitle."----来源".$fromName."----插入id为".$inid."----成功<br>";
     }

     /**
      * Strip HTML tags from $data, keeping only the tags listed in $chule.
      *
      * @param string $data  Markup to clean.
      * @param string $chule Allowed tags, e.g. "<img><p>"; empty string strips everything.
      * @return string
      */
     function guolv($data,$chule){
         return strip_tags((string) $data, $chule);
     }

     /**
      * Release the cURL handle opened by the parent constructor.
      */
     function __destruct(){
         if ($this->curl) {
             curl_close($this->curl);
         }
     }

 }

 //$aNewcont= new spider_cont("http://mil.sohu.com/20160905/n467640017.shtml");

 //$aNewcont->sohuCon(3013);

 ?>

getRollNews.php

 <?php

 //本页为新闻列表获取页面

 //引入详情页

 include ("info.php");

 ///网页爬虫列表

 /**
  * Keyword-list scraper: extracts hot keywords from a fetched ranking page and
  * looks each one up on Sogou news (restricted to sohu.com).
  */
 class spider_list extends spider {

     /**
      * Extract keywords with $zhengze and crawl one Sohu article per keyword.
      *
      * @param string     $zhengze PCRE pattern; its full match (index 0) is one keyword cell.
      * @param int|string $fenlei  term_taxonomy_id handed down to the article scraper.
      */
     function screen_list($zhengze, $fenlei) {
         // Keep only <td> markup so the keyword pattern can anchor on it.
         $this -> data = $this -> guolv($this -> data, '<td>');

         // Drop every occurrence of the word "search" (case-insensitive) from the page text.
         $this -> data = preg_replace("/search/si", "", $this -> data);

         // FALSE (bad pattern) and 0 (no keyword cells) both mean there is nothing to do.
         if (!preg_match_all($zhengze, $this -> data, $regArr, PREG_SET_ORDER)) {
             return;
         }

         // Clean first, THEN drop blanks and duplicates, so that cells which are
         // empty or identical once their markup is removed are not searched.
         $keywords = array();
         foreach ($regArr as $hit) {
             $word = trim(strip_tags($hit[0]));
             if ($word !== '') {
                 $keywords[] = $word;
             }
         }
         $keywords = array_values(array_unique($keywords));

         // Pattern for Sohu article URLs (loop-invariant, so built once).
         $sohuList = "/http:\/\/([\.a-z]+)\.sohu\.com\/20(\d+)\/n(\d+)\.shtml/";

         foreach ($keywords as $i => $new) {
             echo "<hr>".($i+1)."通过:<em>" . $new . "</em> 搜索到::";

             $ser = rawurlencode($new);
             $sohuUrl = new spider_sohu("http://news.sogou.com/news?query=site%3Asohu.com+" . $ser);
             $sohuUrl -> screen_list($sohuList, $fenlei);
         }
     }

     /**
      * Strip HTML tags from $data, keeping only the tags listed in $chule.
      *
      * @param string $data  Markup to clean.
      * @param string $chule Allowed tags, e.g. "<td>".
      * @return string
      */
     function guolv($data, $chule) {
         return strip_tags((string) $data, $chule);
     }

     /**
      * Release the cURL handle opened by the parent constructor.
      */
     function __destruct() {
         if ($this -> curl) {
             curl_close($this -> curl);
         }
     }

 }

 ///网页爬虫列表

 /**
  * Search-result scraper: takes a fetched Sogou news result page, finds the
  * first result's Sohu article link and hands it to the article scraper.
  */
 class spider_sohu extends spider {

     /**
      * Locate the first Sohu article URL in the first search result and crawl it.
      *
      * Prints the URL followed by "<br>". When the page has no result or no
      * matching URL, prints the "empty data" marker and makes no further request.
      *
      * @param string     $zhengze PCRE pattern matching a Sohu article URL.
      * @param int|string $fenlei  term_taxonomy_id handed down to the article scraper.
      */
     function screen_list($zhengze, $fenlei) {
         // First result heading only; (string) guards against a failed download.
         if (!preg_match('/<h3 class="vrTitle">(.*?)<\/h3>/si', (string) $this -> data, $gulv1)) {
             echo "<br>-----数据为空<br>";
             return;
         }

         // Keep the anchor so that its href survives tag stripping.
         $this -> data = $this -> guolv($gulv1[0], '<a><h3>');

         if (!preg_match($zhengze, $this -> data, $regArr)) {
             // Nothing to crawl: avoid constructing a scraper for an empty URL.
             echo "<br>-----数据为空<br>";
             return;
         }

         echo $regArr[0]."<br>";

         $aNewcont = new spider_cont($regArr[0]);
         $aNewcont->sohuCon($fenlei);
     }

     /**
      * Strip HTML tags from $data, keeping only the tags listed in $chule.
      *
      * @param string $data  Markup to clean.
      * @param string $chule Allowed tags, e.g. "<a><h3>".
      * @return string
      */
     function guolv($data, $chule) {
         return strip_tags((string) $data, $chule);
     }

     /**
      * Release the cURL handle opened by the parent constructor.
      */
     function __destruct() {
         if ($this -> curl) {
             curl_close($this -> curl);
         }
     }

 }

     // Entry point: for every configured Baidu ranking page, extract the hot
     // keywords and crawl one Sohu article per keyword into the given category.

     // Pattern matching one keyword cell on a Baidu ranking page.
     $baiduList = '/<td[^>]*class="keyword">(.*?)<\/td>/si';

     // Ranking pages to crawl: aid = target term_taxonomy_id, aurl = page URL,
     // aname = label printed in the progress output.
     $NewUrls  = array(
         array('aid'=>3021,'aurl'=>'http://top.baidu.com/buzz?b=344&c=513&fr=topbuzz_b42_c513','aname'=>'娱乐'),
         array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=341&c=513&fr=topbuzz_b1_c513','aname'=>'今日热点'),
         array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=1&c=513&fr=topbuzz_b344_c513','aname'=>'热点'),
         array('aid'=>2585,'aurl'=>'http://top.baidu.com/buzz?b=42&c=513&fr=topbuzz_b341_c513','aname'=>'热点'),
     );

     foreach ($NewUrls as $entry) {
         echo "<br>-----------分类-----------".$entry['aname']."------------<br>";

         $aNewList = new spider_list($entry['aurl']);

         // The ranking page must be parsed with the Baidu keyword pattern; the
         // Sohu URL pattern is local to spider_list::screen_list() and is not
         // defined in this scope.
         $aNewList->screen_list($baiduList, $entry['aid']);
     }

     // Close the shared database connection.
     mysqlJi::mysqlCl();

 ?>

后来加入了新闻内同义词自动替换。发现替换后，新闻阅读起来太伤眼，已弃之。

基于php编写的新闻类爬虫，插入WordPress数据库的更多相关文章

新闻类爬虫库：Newspaper
newspaper库是一个主要用来提取新闻内容及分析的Python爬虫框架.此库适合抓取新闻网页.操作简单易学,即使对完全没了解过爬虫的初学者也非常的友好,简单学习就能轻易上手,除此之外,使用过程你不 ...
PHP基于单例模式编写PDO类的方法
一.单例模式简介简单的说,一个对象(在学习设计模式之前,需要比较了解面向对象思想)只负责一个特定的任务: 二.为什么要使用PHP单例模式? 1.php的应用主要在于数据库应用, 所以一个应用中会存在 ...
GNE: 4行代码实现新闻类网站通用爬虫
GNE(GeneralNewsExtractor)是一个通用新闻网站正文抽取模块,输入一篇新闻网页的 HTML, 输出正文内容.标题.作者.发布时间.正文中的图片地址和正文所在的标签源代码.GNE在提 ...
基于Node.js实现一个小小的爬虫
以前一直听说有爬虫这种东西,稍微看了看资料,貌似不是太复杂. 正好了解过node.js,那就基于它来个简单的爬虫. 1.本次爬虫目标: 从拉钩招聘网站中找出“前端开发”这一类岗位的信息,并作相应页面分 ...
【转】发布一个基于NGUI编写的UI框架
发布一个基于NGUI编写的UI框架 1.加载,显示,隐藏,关闭页面,根据标示获得相应界面实例 2.提供界面显示隐藏动画接口 3.单独界面层级,Collider,背景管理 4.根据存储的导航信息完成界面 ...
artDialog是一个基于javascript编写的对话框组件，它拥有精致的界面与友好的接口
artDialog是一个基于javascript编写的对话框组件,它拥有精致的界面与友好的接口自适应内容 artDialog的特殊UI框架能够适应内容变化,甚至连外部程序动态插入的内容它仍然能自适应 ...
Java豆瓣电影爬虫——减少与数据库交互实现批量插入
节前一个误操作把mysql中record表和movie表都清空了,显然我是没有做什么mysql备份的.所以,索性我把所有的表数据都清空的,一夜回到解放前…… 项目地址:https://github.c ...
2014金山笔试_编写一个数组类 MyVector
//编写一个数组类 MyVector,数组内容可以动态扩充,实现构造,析构,赋值操作符重载,插入,删除,获取元素个数,获取数组容量(不可以使用STL等的容器类,不能使用 //不连续的存储空间) #in ...
13.CrawlSpider类爬虫
1.CrawlSpider介绍 Scrapy框架中分两类爬虫,Spider类和CrawlSpider类. 此案例采用的是CrawlSpider类实现爬虫. 它是Spider的派生类,Spider类的设 ...

随机推荐

Linux以字节显示内存大小
Linux以字节显示内存大小 youhaidong@youhaidong-ThinkPad-Edge-E545:~$ free -b total used free shared buffers ca ...
Carries SCU - 4437
Carries frog has nn integers a1,a2,-,ana1,a2,-,an, and she wants to add them pairwise. Unfortunately ...
java中回调函数的理解
一,案例一 "通常大家说的回调函数一般就是按照别人(李四)的定好的接口规范写,等待别人(张三)调用的函数,在C语言中,回调函数通常通过函数指针来传递:在Java中,通常就是编写另外一个类或类 ...
spring拦截器的简单实现Interceptor
原文链接:http://lixuanbin.iteye.com/blog/2250100 1. 需求描述某内部管理系统采用Spring MVC搭建,用户可以登录系统进行CRUD以及其他的一些日常管理 ...
Apace Ignite剖析
1.概述 Apache Ignite和Apache Arrow很类似,属于大数据范畴中的内存分布式管理系统.在<Apache Arrow 内存数据>中介绍了Arrow的相关内容,它统一了大 ...
谈谈在.NET Core中使用Redis和Memcached的序列化问题
前言在使用分布式缓存的时候,都不可避免的要做这样一步操作,将数据序列化后再存储到缓存中去. 序列化这一操作,或许是显式的,或许是隐式的,这个取决于使用的package是否有帮我们做这样一件事. 本文 ...
平面图转对偶图(Bzoj1001：狼抓兔子)
如果只会用最小割做这道题那就太菜辣引入来自某学长平面图:在平面上边不相交的图(边可以绕着画) 那么平面图的边与边就围成了许多个区域(这与你画图的方式有关) 定义对偶图:把相邻的两个区域连上边,形 ...
依赖反转原则DIP 与 asp.net core 项目结构
DIP 依赖反转原则 Dependency Inversion Principle 的定义如下: 高级别的模块不应该依赖于低级别的模块, 他们都应该依赖于抽象. 假设Controller依赖于Repo ...
Android开发——设置界面的创建
前言: 最近忙着搞项目,难得有时间,便来整理搞项目中学习到的知识使用之前,先介绍一下android这种的五种数据储存方式,分别为文件储存,SharePrefence,SQL,使用ContentPro ...
templet模式
package template;import java.sql.Connection;import java.sql.ResultSet;/** * Created by marcopan on 1 ...

基于php编写的新闻类爬虫，插入WordPress数据库

基于php编写的新闻类爬虫，插入WordPress数据库的更多相关文章

随机推荐

热门专题