分享一个C#的网页抓取类

using System;

using System.Collections.Generic;

using System.Web;

using System.Text;

using System.Net;

using System.IO;

using System.Text.RegularExpressions;

using System.Collections;

using System.IO.Compression;

/// <summary>
/// Name: web page scraping class.
/// Downloads a page, detects its charset, and extracts the title, description,
/// links, images and body text from the HTML using regular expressions.
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
public class webCrawl
{
    // Responses whose declared Content-Length is at or above this many bytes are skipped (1 MiB).
    private const int MaxContentLength = 1024 * 1024;

    // Timeout applied to every HTTP request, in milliseconds.
    private const int RequestTimeoutMs = 30000;

    // Finds a charset declaration such as charset=utf-8, charset="utf-8" or charset='gb2312'.
    // The value is restricted to name characters so an opening quote no longer yields an empty capture.
    private static readonly Regex CharsetRegex =
        new Regex(@"charset\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9_\-]+)", RegexOptions.IgnoreCase);

    // <title> element; attributes on the tag are tolerated.
    private static readonly Regex TitleRegex =
        new Regex(@"<title\b[^>]*>([^<]*)</title>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // <meta name="Description" content="..."> element.
    private static readonly Regex DescriptionRegex =
        new Regex("<meta name=\"Description\" content=\"([^<]*)\"*>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Absolute http/https links.
    private static readonly Regex LinkRegex =
        new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // The <body> element including its tags.
    private static readonly Regex BodyRegex =
        new Regex(@"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");

    // Whole <img ...> tags (only the all-upper or all-lower spelling, as before).
    private static readonly Regex ImgTagRegex =
        new Regex(@"<(IMG|img)[^>]+>");

    // The src attribute value of an <img> tag, captured in group "imgUrl".
    private static readonly Regex ImgSrcRegex =
        new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
                  RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Paths that end in a known image file extension.
    private static readonly Regex ImageFileRegex =
        new Regex(@"\w+\.(gif|jpe?g|bmp|png)$", RegexOptions.IgnoreCase);

    public webCrawl() { }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its text, decoded
    /// with the charset reported by <see cref="getEncoding"/>.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The page text, or an empty string when the status is not 200, the declared
    /// length is 1 MiB or more, or any error occurs (best-effort: errors are swallowed).
    /// </returns>
    public static string getHtml(string url)
    {
        try
        {
            Encoding en = Encoding.GetEncoding(getEncoding(url));

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Headers.Set("Pragma", "no-cache");
            request.Timeout = RequestTimeoutMs;

            // Dispose the response: an unclosed HttpWebResponse keeps its connection
            // checked out of the per-host pool and eventually stalls later requests.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
                {
                    return String.Empty;
                }

                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, en))
                {
                    return reader.ReadToEnd();
                }
            }
        }
        catch (Exception)
        {
            // Deliberate best-effort: callers treat an empty string as "could not fetch".
            return String.Empty;
        }
    }

    /// <summary>
    /// Determines the charset name of the page at <paramref name="url"/>.
    /// A charset declaration inside the document is preferred, then the charset
    /// from the HTTP response, then the system default encoding.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>A charset name that <see cref="Encoding.GetEncoding(string)"/> accepts.</returns>
    /// <remarks>
    /// Redirects are not followed, so a redirecting URL yields the default encoding.
    /// Request errors propagate to the caller with their original type and stack trace.
    /// </remarks>
    public static string getEncoding(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = RequestTimeoutMs;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxContentLength)
            {
                return Encoding.Default.BodyName;
            }

            Stream body = response.GetResponseStream();
            bool isGzip = response.ContentEncoding != null
                && response.ContentEncoding.Equals("gzip", StringComparison.InvariantCultureIgnoreCase);
            if (isGzip)
            {
                body = new GZipStream(body, CompressionMode.Decompress);
            }

            string html;
            // Charset names are plain ASCII, so ASCII decoding is enough for sniffing.
            // Disposing the reader also disposes the (possibly gzip-wrapped) stream.
            using (StreamReader reader = new StreamReader(body, Encoding.ASCII))
            {
                html = reader.ReadToEnd();
            }

            Match declared = CharsetRegex.Match(html);
            if (declared.Success && isKnownEncoding(declared.Groups["charset"].Value))
            {
                return declared.Groups["charset"].Value;
            }

            if (isKnownEncoding(response.CharacterSet))
            {
                return response.CharacterSet;
            }

            return Encoding.Default.BodyName;
        }
    }

    // Returns true when the name is non-empty and resolves to an encoding on this runtime.
    private static bool isKnownEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return false;
        }

        try
        {
            Encoding.GetEncoding(name);
            return true;
        }
        catch (ArgumentException)
        {
            return false;
        }
        catch (NotSupportedException)
        {
            return false;
        }
    }

    /// <summary>
    /// Returns the text of the page's &lt;title&gt; element with every non-word
    /// character (whitespace and punctuation) removed.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The stripped title, or an empty string when no title is found.</returns>
    public static string getTitle(string url)
    {
        string htmlStr = getHtml(url);
        Match titleMatch = TitleRegex.Match(htmlStr);

        // A failed match yields an empty group value, so no success check is needed.
        return Regex.Replace(titleMatch.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Returns the content of the page's description meta tag with every non-word
    /// character removed.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The stripped description, or an empty string when the tag is absent.</returns>
    public static string getDescription(string url)
    {
        string htmlStr = getHtml(url);
        Match desc = DescriptionRegex.Match(htmlStr);
        return Regex.Replace(desc.Groups[1].Value, @"\W", "");
    }

    /// <summary>
    /// Extracts every distinct absolute http/https link from the given HTML text,
    /// in order of first appearance.
    /// </summary>
    /// <param name="htmlStr">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>The distinct links found.</returns>
    public static List<string> getLink(string htmlStr)
    {
        List<string> list = new List<string>();
        if (string.IsNullOrEmpty(htmlStr))
        {
            return list;
        }

        // The set gives O(1) duplicate checks; the list preserves first-seen order.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match m in LinkRegex.Matches(htmlStr))
        {
            if (seen.Add(m.Value))
            {
                list.Add(m.Value);
            }
        }

        return list;
    }

    /// <summary>
    /// Returns the text inside the page's &lt;body&gt; element with all tags and all
    /// whitespace removed (see <see cref="parseHtml"/>).
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The stripped body text, or an empty string when no body is found.</returns>
    public static string getBody(string url)
    {
        string htmlStr = getHtml(url);
        Match m = BodyRegex.Match(htmlStr);
        return m.Success ? parseHtml(m.Value) : string.Empty;
    }

    /// <summary>
    /// Returns every &lt;img ...&gt; tag found in the page, as raw HTML.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>The matched tags, in document order.</returns>
    public static List<string> getImg(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);

        foreach (Match m in ImgTagRegex.Matches(htmlStr))
        {
            list.Add(m.Value);
        }

        return list;
    }

    /// <summary>
    /// Returns the URL of every image referenced by the page; relative paths are
    /// resolved against <paramref name="url"/> so each result is absolute.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>Absolute image URLs whose path ends in gif, jpg, jpeg, bmp or png.</returns>
    public static List<string> getImgPath(string url)
    {
        List<string> list = new List<string>();
        string htmlStr = getHtml(url);

        foreach (Match m in ImgSrcRegex.Matches(htmlStr))
        {
            string imgPath = m.Groups["imgUrl"].Value.Trim();

            // Second match keeps only sources that end in an image file extension.
            if (!ImageFileRegex.IsMatch(imgPath))
            {
                continue;
            }

            list.Add(toAbsoluteUrl(url, imgPath));
        }

        return list;
    }

    // Resolves an image path against the page URL; already-absolute http(s) paths are returned unchanged.
    private static string toAbsoluteUrl(string pageUrl, string path)
    {
        if (path.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || path.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return path;
        }

        // Uri resolution handles "/root-relative", "../up" and "//host/protocol-relative"
        // paths, which plain string concatenation gets wrong.
        Uri baseUri;
        Uri resolved;
        if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri)
            && Uri.TryCreate(baseUri, path, out resolved))
        {
            return resolved.AbsoluteUri;
        }

        // Fall back to simple concatenation when the page URL cannot be parsed.
        return getUrl(pageUrl) + path;
    }

    /// <summary>
    /// Downloads an image into the directory returned by <c>Server.MapPath("")</c>,
    /// naming it with the current timestamp plus the extension taken from the URL.
    /// Does nothing when the URL is null, empty or contains no '.'.
    /// </summary>
    /// <param name="fileurl">Absolute URL of the image, e.g. http://xxx.com/img/logo.jpg.</param>
    public void DownloadImg(string fileurl)
    {
        if (string.IsNullOrEmpty(fileurl) || !fileurl.Contains("."))
        {
            return;
        }

        // Take the extension from the path only: drop any query string or fragment first.
        string pathPart = fileurl;
        int cut = pathPart.IndexOfAny(new char[] { '?', '#' });
        if (cut >= 0)
        {
            pathPart = pathPart.Substring(0, cut);
        }

        int dot = pathPart.LastIndexOf('.');
        string extension = dot >= 0 ? pathPart.Substring(dot) : string.Empty;
        if (extension.IndexOf('/') >= 0)
        {
            // The last '.' was in the host or a directory name, so there is no file extension.
            extension = string.Empty;
        }

        string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
        string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName;

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(fileurl, filepath);
        }
    }

    /// <summary>
    /// Strips HTML from a fragment: removes every tag, any stray '&lt;' or '&gt;',
    /// and then all whitespace (words are joined together).
    /// </summary>
    /// <param name="html">HTML fragment; null or empty yields an empty string.</param>
    /// <returns>The remaining text with no whitespace.</returns>
    public static string parseHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return string.Empty;
        }

        string value = Regex.Replace(html, "<[^>]*>", string.Empty);
        value = value.Replace("<", string.Empty);
        value = value.Replace(">", string.Empty);
        return Regex.Replace(value, @"\s+", "");
    }

    /// <summary>
    /// Returns the directory part of a URL, always ending in '/'.
    /// http://www.xxx.com           returns http://www.xxx.com/
    /// http://www.xxx.com/art.aspx  returns http://www.xxx.com/
    /// </summary>
    /// <param name="url">The page URL.</param>
    /// <returns>The URL truncated after its last path separator.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string getUrl(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        // Ignore the query string and fragment: a '/' inside them is not a path separator.
        string path = url;
        int cut = path.IndexOfAny(new char[] { '?', '#' });
        if (cut >= 0)
        {
            path = path.Substring(0, cut);
        }

        // The two slashes of "scheme://" are not path separators either.
        int schemeEnd = path.IndexOf("://", StringComparison.Ordinal);
        int pathStart = schemeEnd >= 0 ? schemeEnd + 3 : 0;

        int lastSlash = path.LastIndexOf('/');
        if (lastSlash < pathStart)
        {
            // No path at all (e.g. http://www.xxx.com): the site root is the directory.
            return path + "/";
        }

        return path.Substring(0, lastSlash + 1);
    }
}

分享一个C#的网页抓取类的更多相关文章

Python实现简单的网页抓取
现在开源的网页抓取程序有很多,各种语言应有尽有. 这里分享一下Python从零开始的网页抓取过程第一步:安装Python 点击下载适合的版本https://www.python.org/ 我这里选择 ...
Java实现网页抓取的一个Demo
这个小案例的话我是存放在我的github 上. 下面给出链接自己可以去看下,也可以直接下载源码.有具体的说明 《Java网页抓取》
网页抓取：PHP实现网页爬虫方式小结
来源:http://www.ido321.com/1158.html 抓取某一个网页中的内容,需要对DOM树进行解析,找到指定节点后,再抓取我们需要的内容,过程有点繁琐.LZ总结了几种常用的.易于实现 ...
Python selenium自动化网页抓取器
(开开心心每一天~ ---虫瘾师) 直接入正题---Python selenium自动控制浏览器对网页的数据进行抓取,其中包含按钮点击.跳转页面.搜索框的输入.页面的价值数据存储.mongodb自动i ...
java网页抓取
网页抓取就是,我们想要从别人的网站上得到我们想要的,也算是窃取了,有的网站就对这个网页抓取就做了限制,比如百度直接进入正题 //要抓取的网页地址 String urlStr = "http ...
基于Casperjs的网页抓取技术【抓取豆瓣信息网络爬虫实战示例】
CasperJS is a navigation scripting & testing utility for the PhantomJS (WebKit) and SlimerJS (Ge ...
Python开发爬虫之动态网页抓取篇：爬取博客评论数据——通过Selenium模拟浏览器抓取
区别于上篇动态网页抓取,这里介绍另一种方法,即使用浏览器渲染引擎.直接用浏览器在显示网页时解析 HTML.应用 CSS 样式并执行 JavaScript 的语句. 这个方法在爬虫过程中会打开一个浏览器 ...
Python网络爬虫笔记（一）：网页抓取方式和LXML示例
(一) 三种网页抓取方法 1. 正则表达式: 模块使用C语言编写,速度快,但是很脆弱,可能网页更新后就不能用了. 2. Beautiful Soup 模块使用Python编写,速度慢. ...
Python爬虫之三种网页抓取方法性能比较
下面我们将介绍三种抓取网页数据的方法,首先是正则表达式,然后是流行的 BeautifulSoup 模块,最后是强大的 lxml 模块. 1. 正则表达式如果你对正则表达式还不熟悉,或是需要一些提 ...

随机推荐

Build to win！——获得小黄衫的感想
UPDATE: 应栋哥要求,上传了无遮挡的正面照(我的内心其实是拒绝的!(ㄒoㄒ)) 一.前言&背景从大一上C++课程开始,栋哥就开始安利他大三的软工实践课. 时间过得飞快,大学转眼就过去一 ...
jQuery知识点总结（第六天）
今天工作繁忙,晚上又和所谓的'朋友',吃了自助烧烤. 但我内心是很抗拒的,不知为了什么,竟然稀奇古怪的答应了下来,竟要去吃饭.我向来不喜欢去凑热闹,特别是和志趣不投的人在一起吃,对方所说的话,自己根本 ...
mybatis 传递参数的方法总结
有三种mybatis传递参数的方式: 第一种 mybatis传入参数是有序号的,可以直接用序号取得参数 User selectUser(String name,String area); 可以在xml ...
jquery ajax rest invoke
notice: <script type="text/javascript"> $(document).ready(function() { $("#b03& ...
Wget下载终极用法和15个详细的例子
Wget下载终极用法和15个详细的例子 备注:wget 不支持https 下载,也没有相关https参数,当下载https的时候可以改用 axel. Wget是一种很好用的因特网下载工具,它具有的很多特 ...
ecshop团购显示“库存不足”
产生原因:是因为产品设置了多属性解决办法:打开group_buy.php 第 267行找到 empty($product_info) ? $product_info = array(, ) : '' ...
Django基础，Day10 - template 模板引擎与路径设置
作为一个Web框架,Django需要一个方便的方式来生成动态的HTML.最常见的方法依赖于模板.模板包含所需的HTML输出的静态部分以及一些特殊的语法描述如何插入动态内容. Django框架后端默认支 ...
MVC下分页的自定义分页一种实现
1.引言在MVC开发中我们经常会对数据进行分页的展示.通过分页我们可以从服务端获取指定的数据来进行展示.这样既节约了数据库查询的时间也节约了网络传输的数据量.在MVC开发中使用的比较多的应该是MVC ...
[asp.net core]project.json（2）
摘要上篇文章介绍了project.json中的一部分属性.属性真的比较多,所以分开了,考虑到其中的英文比较简单,也不再进行翻译了,从英文原文中,直接粘贴过来了. project.json(1) pr ...
firefox的plugin-container.exe进程如何关闭?
为什么要关闭container进程? 查看firefox所消耗的资源: ff本身: cpu一般是0-10%, 内存一般是400MB左右 plugin-container: cpu所占的比例很高, 可达 ...

分享一个C#的网页抓取类

分享一个C#的网页抓取类的更多相关文章

随机推荐

热门专题