C# 百度搜索结果xpath分析

using System;

using System.Collections.Generic;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Threading.Tasks;

using HtmlAgilityPack;

namespace xpathGet

{

    class Program

    {

        #region      webclient创建

        /// <summary>
        /// A <see cref="System.Net.WebClient"/> that does not follow redirects automatically,
        /// sends a desktop Chrome User-Agent, accepts gzip/deflate bodies, and exposes the
        /// response of the most recent request through <see cref="Response"/> so callers can
        /// inspect its status code and headers.
        /// </summary>
        public class WebClientBD : System.Net.WebClient
        {
            /// <summary>
            /// Builds the request for <paramref name="address"/> and applies the HTTP-specific
            /// settings when the request really is an HTTP(S) request.
            /// </summary>
            /// <param name="address">The resource being requested.</param>
            /// <returns>The configured request; non-HTTP requests are returned unchanged.</returns>
            protected override System.Net.WebRequest GetWebRequest(Uri address)
            {
                WebRequest baseRequest = base.GetWebRequest(address);
                HttpWebRequest request = baseRequest as HttpWebRequest;
                if (request == null)
                {
                    // file://, ftp:// and similar schemes have none of the HTTP-only knobs
                    // below; hand them back as they are instead of dereferencing null.
                    return baseRequest;
                }

                // Redirects are left to the caller, which reads the Location header itself.
                request.AllowAutoRedirect = false;
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
                return request;
            }

            /// <summary>
            /// The response of the most recent request made through this client, or null when
            /// no request has produced a response yet.
            /// </summary>
            public WebResponse Response { get; private set; }

            /// <summary>
            /// Obtains the response for <paramref name="request"/> and records it in
            /// <see cref="Response"/>.
            /// </summary>
            /// <param name="request">The request created by <see cref="GetWebRequest"/>.</param>
            /// <returns>
            /// The server's response. When the server answered with an error status, the error
            /// response carried by the <see cref="WebException"/> is returned so that its status
            /// and body remain inspectable.
            /// </returns>
            /// <exception cref="WebException">
            /// The request failed without any response at all (DNS failure, refused connection, ...).
            /// </exception>
            protected override WebResponse GetWebResponse(WebRequest request)
            {
                // Never let a previous request's response masquerade as this one's.
                this.Response = null;

                try
                {
                    this.Response = base.GetWebResponse(request);
                }
                catch (WebException ex)
                {
                    if (ex.Response == null)
                    {
                        // Nothing came back from the server; there is no response to expose.
                        throw;
                    }

                    this.Response = ex.Response;
                }

                return this.Response;
            }
        }

        /// <summary>
        /// Resolves the address that <paramref name="url"/> redirects to, following at most
        /// two HTTP 302 hops by hand (the underlying client does not auto-redirect).
        /// </summary>
        /// <param name="url">The address to probe, for example a Baidu result link.</param>
        /// <returns>
        /// <list type="bullet">
        /// <item><description>null when the first response is not a 302, or the second response is neither 302 nor 200;</description></item>
        /// <item><description>the second response's Location header, exactly as sent, when the second response is a 302;</description></item>
        /// <item><description>the first hop's target when the second response is 200;</description></item>
        /// <item><description>a string starting with "error:" followed by a message when the lookup fails.</description></item>
        /// </list>
        /// </returns>
        public static string lastUrl(string url)
        {
            try
            {
                string firstLocation;
                if (ProbeRedirect(url, out firstLocation) != HttpStatusCode.Found)
                {
                    return null;
                }

                if (firstLocation == null)
                {
                    return "error:" + "The 302 response carried no Location header.";
                }

                // A Location that is not absolute is taken to be relative to Baidu's site root.
                // URI schemes are case-insensitive, so compare ordinally and ignore case.
                string prefix = firstLocation.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                    ? string.Empty
                    : "http://www.baidu.com";
                string redirectLocal = prefix + firstLocation;

                string secondLocation;
                HttpStatusCode secondStatus = ProbeRedirect(redirectLocal, out secondLocation);

                if (secondStatus == HttpStatusCode.Found)
                {
                    return secondLocation;
                }

                if (secondStatus == HttpStatusCode.OK)
                {
                    return redirectLocal;
                }

                return null;
            }
            catch (Exception ex)
            {
                // Callers receive failures as an "error:"-prefixed string rather than an exception.
                return "error:" + ex.Message;
            }
        }

        /// <summary>
        /// Requests <paramref name="url"/> once without following redirects and reports the
        /// status code together with the Location header of the answer.
        /// </summary>
        /// <param name="url">The address to request.</param>
        /// <param name="location">The value of the Location response header, or null when absent.</param>
        /// <returns>The HTTP status code of the response.</returns>
        /// <exception cref="InvalidOperationException">The request did not yield an HTTP response.</exception>
        private static HttpStatusCode ProbeRedirect(string url, out string location)
        {
            using (WebClientBD wc = new WebClientBD())
            {
                // NOTE(review): default credentials are attached to requests for arbitrary
                // redirect targets as well - confirm that this is intended.
                wc.Credentials = CredentialCache.DefaultCredentials;

                // The body is irrelevant here; the download only drives the request.
                wc.DownloadData(url);

                HttpWebResponse response = wc.Response as HttpWebResponse;
                if (response == null)
                {
                    throw new InvalidOperationException("No HTTP response was received for " + url);
                }

                // Read everything needed before the client goes out of scope.
                location = response.Headers["location"];
                return response.StatusCode;
            }
        }

        #endregion

        /// <summary>
        /// Downloads <paramref name="url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">The address of the page to download.</param>
        /// <returns>The response body as text.</returns>
        /// <remarks>
        /// The request goes through <see cref="WebClientBD"/>, which does not follow redirects,
        /// so for a redirecting address the text returned is the body of the redirect response.
        /// The body is always decoded as UTF-8; a page served in another encoding (the original
        /// author mentions GB2312) will come back garbled.
        /// </remarks>
        public static string GetHtmlSource(string url)
        {
            // Dispose the client deterministically instead of leaving it to the finalizer.
            using (WebClientBD wc = new WebClientBD())
            {
                wc.Credentials = CredentialCache.DefaultCredentials;
                byte[] pageData = wc.DownloadData(url);
                return Encoding.UTF8.GetString(pageData);
            }
        }

        /// <summary>
        /// Downloads the first five Baidu result pages for a fixed keyword, saves each raw page
        /// to disk, extracts organic results and ads with XPath, and writes both lists to text
        /// files. Every extracted entry is one line of the form
        /// <c>title|link|content|cite</c> followed by a tab; after each page a marker line is added.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Paging URL: {0} is the keyword, {1} the zero-based page index. The literal "0"
            // directly after {1} multiplies the index by ten (page index 2 -> pn=20).
            string address = "http://www.baidu.com/s?wd={0}&pn={1}0&oq={0}&ie=utf-8&usm=4";
            string key = "代购";

            // The URL declares ie=utf-8, so send the keyword as UTF-8 percent-encoding
            // explicitly instead of relying on how the HTTP stack treats raw non-ASCII text.
            string escapedKey = Uri.EscapeDataString(key);

            List<string> organicLines = new List<string>();
            List<string> adLines = new List<string>();

            // {0}/{1} receive the path below an ad container (ids containing 300 / 400).
            string adXPath = "//div[@id='content_left']//div[contains(@id,'300')]{0}|//div[@id='content_left']//div[contains(@id,'400')]{1}";
            // {0} receives the path below an organic result container.
            string organicXPath = "//div[@id='content_left']//div[contains(@class,'c-container')]{0}";

            for (int pnIndex = 0; pnIndex < 5; pnIndex++)
            {
                string htmlPageSource = GetHtmlSource(string.Format(address, escapedKey, pnIndex.ToString()));

                // Keep the raw page so the XPath expressions can be re-checked offline.
                // NOTE(review): the verbatim literals in this method contain a doubled
                // backslash; they are left untouched here - confirm the intended paths.
                string path = @"d:\\infolist_Page" + (pnIndex + 1) + ".html";
                File.WriteAllText(path, htmlPageSource, Encoding.UTF8);

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlPageSource);

                AppendOrganicResults(doc, organicXPath, organicLines);
                organicLines.Add(String.Format("以上为第{0}页搜索结果。", pnIndex + 1));

                AppendAdResults(doc, adXPath, adLines);
                adLines.Add(String.Format("以上为第{0}页搜索结果。", pnIndex + 1));
            }

            File.WriteAllLines(@"d:\\infolist.txt", organicLines.ToArray(), Encoding.UTF8);
            File.WriteAllLines(@"d:\\infolist2.txt", adLines.ToArray(), Encoding.UTF8);
        }

        /// <summary>Extracts the organic results of one page and appends one line per result title.</summary>
        private static void AppendOrganicResults(HtmlDocument doc, string organicXPath, List<string> output)
        {
            // The title anchor supplies both the title text and the link (href).
            IList<HtmlNode> titleNodes = SelectNodesOrEmpty(doc, string.Format(organicXPath, "/h3/a[1]"));

            // The second alternative originally read div['c-span18 c-span-last'], a string-literal
            // predicate that is always true in XPath; it is written as a class test here.
            IList<HtmlNode> contentNodes = SelectNodesOrEmpty(doc, UnionXPath(
                organicXPath,
                "//div[@class='c-abstract']",
                "//div[@class='c-span18 c-span-last']/p[1]",
                "//div[@class='c-offset']",
                "//div[@class='op_dict_content']",
                "//p[contains(text(),'由于该网站的robots.txt文件存在限制指令')]"));

            IList<HtmlNode> citeNodes = SelectNodesOrEmpty(doc, UnionXPath(
                organicXPath,
                "//span[@class='c-showurl']",
                "//a[@class='c-showurl']"));

            List<string> cites = new List<string>();
            foreach (HtmlNode citeNode in citeNodes)
            {
                cites.Add(CleanText(citeNode.InnerText));
            }

            // A "latest related information" result has a title but no cite element, which
            // shifts every later cite by one. Put a placeholder in its slot to realign them.
            const string newsMarker = "的最新相关信息";
            if (cites.Count < titleNodes.Count)
            {
                if (titleNodes[cites.Count].InnerText.Contains(newsMarker))
                {
                    // The cite-less result is the one right after the last cite found.
                    cites.Add("new info");
                }
                else
                {
                    for (int i = 0; i < titleNodes.Count; i++)
                    {
                        if (CleanText(titleNodes[i].InnerText).Contains(newsMarker))
                        {
                            // Clamp so a marker beyond the current end appends instead of throwing.
                            cites.Insert(Math.Min(i, cites.Count), "new info");
                        }
                    }
                }
            }

            for (int j = 0; j < titleNodes.Count; j++)
            {
                string title = CleanText(titleNodes[j].InnerText);
                string link = CleanText(titleNodes[j].GetAttributeValue("href", ""));
                string content = NodeTextAt(contentNodes, j);
                string cite = j < cites.Count ? cites[j] : string.Empty;

                output.Add(title + "|" + link + "|" + content + "|" + cite + "\t");
            }
        }

        /// <summary>Extracts the ads of one page and appends one line per ad title.</summary>
        private static void AppendAdResults(HtmlDocument doc, string adXPath, List<string> output)
        {
            // The title anchor supplies both the title text and the link (href).
            IList<HtmlNode> titleNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[1]/h3/a[1]", "/div[1]/h3/a[1]"));
            IList<HtmlNode> contentNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[2]", "/div[2]"));
            IList<HtmlNode> citeNodes = SelectNodesOrEmpty(doc, string.Format(adXPath, "/div[2]//a/span[1]", "/div[3]/a/span"));

            for (int i = 0; i < titleNodes.Count; i++)
            {
                string title = CleanText(titleNodes[i].InnerText);
                string link = CleanText(titleNodes[i].GetAttributeValue("href", ""));
                string content = NodeTextAt(contentNodes, i);
                string cite = NodeTextAt(citeNodes, i);

                output.Add(title + "|" + link + "|" + content + "|" + cite + "\t");
            }
        }

        /// <summary>Runs an XPath query and returns an empty list, never null, when nothing matches.</summary>
        private static IList<HtmlNode> SelectNodesOrEmpty(HtmlDocument doc, string xpath)
        {
            // HtmlAgilityPack's SelectNodes yields null rather than an empty collection on no match.
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
            if (nodes == null)
            {
                return new List<HtmlNode>();
            }

            return nodes;
        }

        /// <summary>Expands the template once per suffix and joins the results into one XPath union.</summary>
        private static string UnionXPath(string template, params string[] suffixes)
        {
            return string.Join("|", suffixes.Select(suffix => string.Format(template, suffix)));
        }

        /// <summary>Returns the cleaned inner text of the node at index, or an empty string when there is no such node.</summary>
        private static string NodeTextAt(IList<HtmlNode> nodes, int index)
        {
            if (index < 0 || index >= nodes.Count)
            {
                return string.Empty;
            }

            return CleanText(nodes[index].InnerText);
        }

        /// <summary>Trims the text and removes spaces, line feeds and non-breaking spaces from it.</summary>
        private static string CleanText(string raw)
        {
            if (string.IsNullOrEmpty(raw))
            {
                return string.Empty;
            }

            // NOTE(review): the original chained three Replace calls whose first and third
            // arguments both look like a plain space; the third is assumed to have been a
            // non-breaking space (U+00A0) - confirm.
            return raw.Trim()
                .Replace(" ", string.Empty)
                .Replace("\n", string.Empty)
                .Replace("\u00A0", string.Empty);
        }

    }

}

C# 百度搜索结果xpath分析的更多相关文章

PHP网络爬虫实践：抓取百度搜索结果，并分析数据结构
百度的搜索引擎有反爬虫机制,我先直接用guzzle试试水.代码如下: <?php /** * Created by Benjiemin * Date: 2020/3/5 * Time: 14:5 ...
python--selenium简单模拟百度搜索点击器
python--selenium简单模拟百度搜索点击器发布时间:2018-02-28 来源:网络上传者:用户关键字: selenium 模拟简单点击搜索百度发表文章摘要:用途:简单模拟 ...
Python：输入关键字进行百度搜索并爬取搜索结果
学习自:手把手教你用Python爬取百度搜索结果并保存 - 云+社区 - 腾讯云如何利用python模拟百度搜索,Python交流,技术交流区,鱼C论坛指定关键字,对其进行百度搜索,保存搜索结果, ...
Splinter学习－－初探1，模拟百度搜索
Splinter是以Selenium, PhantomJS 和 zope.testbrowser为基础构建的web自动化测试工具,基本原理同selenium 支持的浏览器包括:Chrome, Fire ...
利用python爬取海量疾病名称百度搜索词条目数的爬虫实现
实验原因: 目前有一个医疗百科检索项目,该项目中对关键词进行检索后,返回的结果很多,可惜结果的排序很不好,影响用户体验.简单来说,搜索出来的所有符合疾病中,有可能是最不常见的疾病是排在第一个的,而最有 ...
jsonp跨越请求百度搜索api 实现下拉列表提示
题目来源: 最近在做百度IFE前端技术学院的题,然后有一题就是模拟百度搜索智能提示.题目是开源的,稍后给出地址. 因为博主没学过后端啊,欲哭无泪,所以不能实现后端模糊搜索,那如果前端ajax纯粹请求一 ...
百度搜索URL参数搜索关键字
http://www.baidu.com/s?wd=关键字 wd(Keyword):查询的关键词: http://www.baidu.com/s?wd=关键字&cl=3 cl(Class):搜 ...
python爬取百度搜索结果ur汇总
写了两篇之后,我觉得关于爬虫,重点还是分析过程分析些什么呢: 1)首先明确自己要爬取的目标比如这次我们需要爬取的是使用百度搜索之后所有出来的url结果 2)分析手动进行的获取目标的过程,以便以程序 ...
使用curl制作简易百度搜索
这几天研究了一下php中的curl类库,做了一个简单的百度搜索,先上代码 <div style="width:200px;height:100px;"> <div ...

随机推荐

alpha阶段的 postmortem 报告
1. 每个成员到了第二次alpha 阶段与第一次相比,取得什么进步? 成员黄杰学会了app环境的搭建和代码的基本理解李炫宗更加明白安卓代码的编写和理解康取对安卓界面的设计有一些了解 ...
英语学习APP
第一部分调研, 评测下载并使用,描述最简单直观的个人第一次上手体验. 界面高大上,看起来很美观,是个不错的英语学习软件.我很喜欢. 2.按照<构建之法>13.1节描述的 bug 定义, ...
helm的安装与简单使用
根据 csdn 博客整理学习原始博客地址: https://blog.csdn.net/weiguang1017/article/details/78045013 1. 下载所需要的文件: 客户端文 ...
android管理SD卡 mksdcard
在创建Android模拟器的时候,会创建一个虚拟的sd卡.我们还可以通过mksdcard命令创建sd卡,在运行模拟器的时候,可以选择具体的sd卡. 1.创建sd卡: mksdcard [-l labl ...
【IntelliJ】使用IntelliJ IDEA 2016将Java Web项目导出为War包
本文记录使用IDEA导出war包的过程以及碰到问题的解决办法虽说现在改用IDEA进行开发了,但还是用eclipse打war包 ….囧这样下去不是办法... 于是今天就试着使用IDEA进行打包. 项 ...
obj.getClass() == Person.class 用于判断类型
obj.getClass() == Person.class 用于判断类型
洛谷P13445 [USACO5.4]奶牛的电信Telecowmunication（网络流）
题目描述农夫约翰的奶牛们喜欢通过电邮保持联系,于是她们建立了一个奶牛电脑网络,以便互相交流.这些机器用如下的方式发送电邮:如果存在一个由c台电脑组成的序列a1,a2,...,a(c),且a1与a2相 ...
Fire Net HDU - 1045（二分匹配）
把每一列中相邻的 . 缩为一个点作为二分图的左边把每一行中相邻的 . 缩为一个点作为二分图的右边然后求最大匹配即可这题用匈牙利足够了...然而..我用了hk...有点大材小用的感觉// ...
NOI备战总结ing……
持续做题ing…… 已完成: 树套树点分治博弈论凸包杜教筛反演 FFT 数位DP DP专栏网络流数学专栏正在进行中: waiting: SAM Kd-tree 矩阵树分治 FWT B ...
oracle递归查询（查询条件ID下得所有子集）
一.CREATE TABLE TBL_TEST ( ID NUMBER, NAME VARCHAR2(100 BYTE), PID NUMBER ...

C# 百度搜索结果xpath分析

C# 百度搜索结果xpath分析的更多相关文章

随机推荐

热门专题