C#实现网页爬虫

HTTP请求工具类（功能：1、获取网页html；2、下载网络图片）：

using System;

using System.Collections.Generic;

using System.Drawing;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Threading.Tasks;

using System.Windows.Forms;

namespace Utils

{

    /// <summary>
    /// HTTP helper with two functions: fetching a page's HTML and downloading a remote image.
    /// </summary>
    public class HttpRequestUtil
    {
        // User-Agent header sent with every request issued by this class.
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS URL of the page.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="NotSupportedException">The URL does not use an HTTP-based scheme.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // Nothing is sent until GetResponse() is called; this issues a GET request.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download" folder under the
        /// application's startup path, but only when the decoded image is at least
        /// <paramref name="minWidth"/> x <paramref name="minHeight"/> pixels.
        /// The file name is the part of the URL after the last '/'; an existing file is never overwritten.
        /// </summary>
        /// <param name="url">Absolute HTTP/HTTPS URL of the image.</param>
        /// <param name="minWidth">Minimum accepted image width in pixels.</param>
        /// <param name="minHeight">Minimum accepted image height in pixels.</param>
        /// <exception cref="NotSupportedException">The URL does not use an HTTP-based scheme.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            int pos = url.LastIndexOf("/") + 1;
            string fileName = url.Substring(pos);
            if (fileName.Length == 0)
            {
                // URL ends with '/': there is no file name to save under, so skip before hitting the network.
                return;
            }

            string path = Path.Combine(Application.StartupPath, "download");
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(path);

            string filePathName = Path.Combine(path, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            // Nothing is sent until GetResponse() is called; this issues a GET request.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (MemoryStream memoryStream = new MemoryStream())
            {
                // Buffer the whole body so the image can be measured before anything is written to disk.
                responseStream.CopyTo(memoryStream);

                // Rewind so decoding starts at the first byte of the downloaded data.
                memoryStream.Position = 0;

                bool largeEnough;
                using (Image tempImage = Image.FromStream(memoryStream, true))
                {
                    largeEnough = tempImage.Height >= minHeight && tempImage.Width >= minWidth;
                }

                if (largeEnough)
                {
                    File.WriteAllBytes(filePathName, memoryStream.ToArray());
                }
            }
        }

        /// <summary>
        /// Creates an HTTP request for <paramref name="url"/> with the shared User-Agent applied.
        /// </summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                // WebRequest.Create returns other request types for non-HTTP schemes (e.g. ftp:, file:).
                throw new NotSupportedException("Only HTTP/HTTPS URLs are supported: " + url);
            }

            request.UserAgent = UserAgent;
            return request;
        }
    }

}

VisitedHelper类：

using System;

using System.Collections.Generic;

using System.IO;

using System.Linq;

using System.Text;

using System.Threading.Tasks;

using System.Windows.Forms;

namespace Utils

{

    /// <summary>
    /// Process-wide set of URLs that have already been visited.
    /// Each individual call is safe to make from multiple threads at once.
    /// </summary>
    public class VisitedHelper
    {
        // Guards m_VisitedList: the crawler calls this class from several thread-pool threads,
        // and the underlying collection is not safe for concurrent reads and writes.
        private static readonly object m_SyncRoot = new object();

        // A hash set gives constant-time lookups; string comparison is ordinal (case-sensitive).
        private static readonly HashSet<string> m_VisitedList = new HashSet<string>();

        #region IsVisited

        /// <summary>
        /// Returns true when <paramref name="url"/> has previously been passed to <see cref="Add"/>.
        /// </summary>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedList.Contains(url);
            }
        }

        #endregion

        #region Add

        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same URL again has no effect.
        /// </summary>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedList.Add(url);
            }
        }

        #endregion
    }

}

多线程爬取网页代码：

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.IO;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading;

using System.Threading.Tasks;

using System.Windows.Forms;

using Utils;

namespace 爬虫

{

    /// <summary>
    /// Main window of the image crawler. Starting from the URL typed by the user, it downloads the
    /// images referenced by each page and follows the page's same-host links on thread-pool threads.
    /// </summary>
    public partial class Form1 : Form
    {
        // NOTE(review): the numeric literals for the three values below were missing from the
        // source listing (digits stripped during extraction). The values here are restored
        // guesses - confirm against the original project.
        private const int MaxPoolThreads = 1000;

        // Minimum image size in pixels; both are overwritten from the text boxes on every run.
        private static int m_MinWidth = 0;
        private static int m_MinHeight = 0;

        // Number of queued page crawls that have finished; incremented from worker threads.
        private static int m_CompletedCount = 0;

        // Regex instances are built once and shared; matching on a Regex is thread-safe.
        private static readonly Regex m_RegA = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegImg = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex m_RegHost = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Starts a crawl from the URL in txtUrl using the minimum sizes typed in the text boxes.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(MaxPoolThreads, MaxPoolThreads);

            // TryParse stores 0 when the text is not a valid integer, i.e. "no minimum".
            int.TryParse(txtMinWidth.Text, out m_MinWidth);
            int.TryParse(txtMinHeight.Text, out m_MinHeight);

            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片…";
            timer1.Start();

            // Read the text box here, on the UI thread; controls must not be touched from the worker.
            string startUrl = txtUrl.Text;

            new Thread(new ThreadStart(delegate()
            {
                try
                {
                    Crawling(startUrl, null);
                }
                catch (Exception ex)
                {
                    // An exception escaping this thread (bad URL, network failure) would end the process.
                    LogError(ex);
                }
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues every link that stays on
        /// <paramref name="host"/> for crawling on the thread pool.
        /// </summary>
        /// <param name="url">Absolute URL of the page to crawl.</param>
        /// <param name="host">
        /// Host prefix (as returned by <see cref="GetHost"/>) the crawl is restricted to, or null to
        /// derive it from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            if (VisitedHelper.IsVisited(url))
            {
                return;
            }

            VisitedHelper.Add(url);

            if (host == null)
            {
                host = GetHost(url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);

            foreach (Match mImg in m_RegImg.Matches(pageHtml))
            {
                try
                {
                    // Both dimensions are read from the whole <img> tag text; a missing attribute
                    // yields int.MaxValue so the real size is checked after download instead.
                    int imageWidth = GetImageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetImageWidthOrHeight(mImg.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    string imageUrl = ResolveUrl(url, mImg.Groups[1].Value);
                    if (imageUrl != null)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one failed image must not stop the rest of the page.
                    LogError(ex);
                }
            }

            // Follow the page's links recursively, one thread-pool work item per link.
            foreach (Match mA in m_RegA.Matches(pageHtml))
            {
                // Declared inside the loop so each queued delegate captures its own value.
                string nextUrl = ResolveUrl(url, mA.Groups[1].Value);
                if (nextUrl == null)
                {
                    continue;
                }

                // Stay on the starting host: test the link's host, not the current page's.
                if (!string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                {
                    try
                    {
                        Crawling(nextUrl, host);
                        Interlocked.Increment(ref m_CompletedCount);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a page that fails to load is skipped.
                        LogError(ex);
                    }
                }));
            }
        } //end Crawling

        /// <summary>
        /// Resolves <paramref name="link"/>, as found on <paramref name="pageUrl"/>, to an absolute
        /// URL without its fragment. Returns null when the result is not an http/https URL
        /// (for example javascript: or mailto: links) or cannot be parsed.
        /// </summary>
        private static string ResolveUrl(string pageUrl, string link)
        {
            Uri baseUri;
            Uri resolved;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri) ||
                !Uri.TryCreate(baseUri, link, out resolved))
            {
                return null;
            }

            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Drop "#fragment" so the same page is not treated as several different URLs.
            return resolved.GetLeftPart(UriPartial.Query);
        }

        /// <summary>
        /// Returns the scheme-and-authority prefix of <paramref name="url"/> followed by "/",
        /// e.g. "http://example.com/". Returns "/" when no http/https prefix can be found.
        /// </summary>
        private string GetHost(string url)
        {
            Uri uri;
            if (Uri.TryCreate(url, UriKind.Absolute, out uri) &&
                (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
            {
                // Parsing through Uri gives one consistent form for hosts that differ only in spelling.
                return uri.GetLeftPart(UriPartial.Authority) + "/";
            }

            // Fallback for strings Uri cannot parse: take the leading http(s)://host part as-is.
            Match mHost = m_RegHost.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows "finished" once every pool worker thread is available again and at
        /// least one queued crawl has completed; otherwise shows the "crawling" message.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            int workerThreads;
            int completionPortThreads;
            int maxWorkerThreads;
            int maxCompletionPortThreads;
            ThreadPool.GetAvailableThreads(out workerThreads, out completionPortThreads);
            // Compare against the pool's actual maximum rather than a hard-coded number.
            ThreadPool.GetMaxThreads(out maxWorkerThreads, out maxCompletionPortThreads);

            if (workerThreads == maxWorkerThreads && m_CompletedCount > 0)
            {
                lblMsg.Text = "已结束";
            }
            else
            {
                lblMsg.Text = "正在爬取图片…";
            }
        }

        /// <summary>
        /// Extracts the width or height declared in an &lt;img&gt; tag, looking first for a
        /// width="N"/height="N" attribute and then for a CSS "width: Npx;" / "height: Npx;" rule.
        /// </summary>
        /// <param name="imageTagString">Text of the &lt;img&gt; tag.</param>
        /// <param name="isWidth">true to read the width, false to read the height.</param>
        /// <returns>
        /// The declared size truncated to an integer, or int.MaxValue when no parsable size is
        /// declared (so that an unknown size always passes a minimum-size check).
        /// </returns>
        private int GetImageWidthOrHeight(string imageTagString, bool isWidth)
        {
            string tag = isWidth ? "width" : "height";
            string[] patterns =
            {
                string.Format(@"{0}=""([\d\.]+)""", tag),
                string.Format(@"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", tag)
            };

            foreach (string pattern in patterns)
            {
                Match match = Regex.Match(imageTagString, pattern, RegexOptions.IgnoreCase);
                if (!match.Success)
                {
                    continue;
                }

                // Invariant culture: HTML/CSS numbers always use '.' as the decimal separator.
                double value;
                if (double.TryParse(match.Groups[1].Value,
                                    System.Globalization.NumberStyles.Float,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out value))
                {
                    return (int)value;
                }
            }

            return int.MaxValue;
        }

        /// <summary>
        /// Records a swallowed exception to the debug output so failures are not completely silent.
        /// </summary>
        private static void LogError(Exception ex)
        {
            System.Diagnostics.Debug.WriteLine(ex);
        }
    } //end Form1

    /// <summary>

    /// Delegate for accessing controls across threads.

    /// </summary>

    public delegate void InvokeDelegate();

}

截图：

C#实现网页爬虫的更多相关文章

cURL 学习笔记与总结（2）网页爬虫、天气预报
例1.一个简单的 curl 获取百度 html 的爬虫程序(crawler): spider.php <?php /* 获取百度html的简单网页爬虫 */ $curl = curl_init( ...
c#网页爬虫初探
一个简单的网页爬虫例子! html代码: <head runat="server"> <title>c#爬网</title> </head ...
网页爬虫--scrapy入门
本篇从实际出发,展示如何用网页爬虫.并介绍一个流行的爬虫框架~ 1. 网页爬虫的过程所谓网页爬虫,就是模拟浏览器的行为访问网站,从而获得网页信息的程序.正因为是程序,所以获得网页的速度可以轻易超过单 ...
网页爬虫的设计与实现（Java版）
网页爬虫的设计与实现(Java版) 最近为了练手而且对网页爬虫也挺感兴趣,决定自己写一个网页爬虫程序. 首先看看爬虫都应该有哪些功能. 内容来自(http://www.ibm.com/deve ...
Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱（转）
原文:http://www.52nlp.cn/python-网页爬虫-文本处理-科学计算-机器学习-数据挖掘曾经因为NLTK的缘故开始学习Python,之后渐渐成为我工作中的第一辅助脚本语言,虽然开 ...
[resource-]Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
reference: http://www.52nlp.cn/python-%e7%bd%91%e9%a1%b5%e7%88%ac%e8%99%ab-%e6%96%87%e6%9c%ac%e5%a4% ...
网页抓取：PHP实现网页爬虫方式小结
来源:http://www.ido321.com/1158.html 抓取某一个网页中的内容,需要对DOM树进行解析,找到指定节点后,再抓取我们需要的内容,过程有点繁琐.LZ总结了几种常用的.易于实现 ...
Java正则表达式--网页爬虫
网页爬虫:其实就一个程序用于在互联网中获取符合指定规则的数据爬取邮箱地址,爬取的源不同,本地爬取或者是网络爬取 (1)爬取本地数据: public static List<String> ...
从robots.txt開始网页爬虫之旅
做个网页爬虫或搜索引擎(下面统称蜘蛛程序)的各位一定不会陌生,在爬虫或搜索引擎訪问站点的时候查看的第一个文件就是robots.txt了.robots.txt文件告诉蜘蛛程序在server上什么文件是能 ...
Python网页爬虫（一）
很多时候我们想要获得网站的数据,但是网站并没有提供相应的API调用,这时候应该怎么办呢?还有的时候我们需要模拟人的一些行为,例如点击网页上的按钮等,又有什么好的解决方法吗?这些正是python和网页爬 ...

随机推荐

js实现快速排序
非原创: var quickSort = function(arr){ if(arr.length<=1){return arr;} var pivotIndex = Math.floor(ar ...
如何为编程爱好者设计一款好玩的智能硬件（七）——LCD1602点阵字符型液晶显示模块驱动封装（上）
当前进展: 一.我的构想:如何为编程爱好者设计一款好玩的智能硬件(一)——即插即用.积木化.功能重组的智能硬件模块构想二.别人家的孩子:如何为编程爱好者设计一款好玩的智能硬件(二)——别人是如何设计 ...
[HIMCM暑期班]第2课:建模
第二节课从最简单的模型开始入手:七桥问题. 首先,先去wikipedia上了解一些有关七桥问题的背景知识.http://en.wikipedia.org/wiki/Seven_Bridges_of_K ...
Linux-磁盘管理小结
这篇博文主要总结了Linux磁盘的一些操作,主要是硬盘的加载,分区(MBR分区和GPT分区),分区的挂载,以及swap分区的加载设置. 基础命令 df查看磁盘分区使用状况 -l //仅显示本地磁盘(默 ...
知方可补不足～sqlserver中使用ROW_NUMBER进行的快速分页
回到目录这个在SQL2005之后最见的一种分页方式,也是Linq默认生成的执行分页的方法(skip,take),当然在性能上小数量没有问题,在数据达到百万时会很慢,这是我们要清楚的,有时我们在LIN ...
模拟淘宝登录和购物车功能：使用cookie记录登录名，下次登录时能够记得上次的登录名，使用cookie模拟购物车功能，使用session记住登录信息并验证是否登录，防止利用url打开网站，并实现退出登录功能
Login <%@ page language="java" contentType="text/html; charset=UTF-8" pageEnc ...
C#并行编程-Task
菜鸟学习并行编程,参考<C#并行编程高级教程.PDF>,如有错误,欢迎指正. 目录 C#并行编程-相关概念 C#并行编程-Parallel C#并行编程-Task C#并行编程-并发集合 ...
WPF入门教程系列八——布局之Grid与UniformGrid（三）
五. Grid Grid顾名思义就是“网格”,它的子控件被放在一个一个实现定义好的小格子里面,整齐配列. Grid和其他各个Panel比较起来,功能最多也最为复杂.要使用Grid,首先要向RowDef ...
编写Shader时的一些性能考虑
编写shader时的一些建议:1.只计算需要计算的东西:2.通常,需要渲染的像素比顶点数多,而顶点数又比物体数多很多.所以如果可以,尽量将运算从PS移到VS,或直接通过script来设置某些固定值:3 ...
AngularJs单元测试
这篇文章主要介绍了angularJS中的单元测试实例,本文主要介绍利用Karma和Jasmine来进行ng模块的单元测试,并用Istanbul 来生成代码覆盖率测试报告,需要的朋友们可以参考下,以下 ...

C#实现网页爬虫

C#实现网页爬虫的更多相关文章

随机推荐

热门专题