C# 抓取网页的img src带参数的图片链接，并下载

using System;

using System.Collections.Generic;

using System.ComponentModel;

using System.Data;

using System.Drawing;

using System.IO;

using System.Linq;

using System.Net;

using System.Text;

using System.Text.RegularExpressions;

using System.Threading;

using System.Windows.Forms;

namespace ImageCollection

{

    public partial class Form1 : Form

    {

        /// <summary>
        /// Directory that receives downloaded images: an "img" folder under the
        /// application's base directory.
        /// </summary>
        /// <remarks>
        /// The field is named Path and therefore hides System.IO.Path inside this class,
        /// which is why the type is written fully qualified here. Path.Combine is used so
        /// the result is correct whether or not BaseDirectory ends with a separator.
        /// </remarks>
        private static readonly string Path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "img");

        /// <summary>
        /// Creates the form and runs InitializeComponent.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: reads the page URL from txturl and, when it is not empty,
        /// logs a start message to txtimg and runs ShuaQu on a background thread.
        /// An empty URL only produces a message box.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnshuaqu_Click(object sender, EventArgs e)
        {
            string pageUrl = txturl.Text.Trim();

            if (!string.IsNullOrEmpty(pageUrl))
            {
                txtimg.AppendText("开始抓取中:\r\n");

                // The scrape runs off the calling thread; IsBackground lets the process
                // exit even if the worker is still running.
                Thread worker = new Thread(new ThreadStart(() => ShuaQu(pageUrl)));
                worker.IsBackground = true;
                worker.Start();
                return;
            }

            MessageBox.Show("请输入URl");
        }

        /// <summary>
        /// Scrapes one page: empties the output directory, fetches the HTML at
        /// <paramref name="url"/>, shows the matched title element in txttitle, downloads
        /// every image whose host is www.xxx.com and logs progress to txtimg.
        /// </summary>
        /// <remarks>
        /// All control access goes through Control.Invoke, so this method is meant to be
        /// called from a non-UI thread. It returns only after every download thread it
        /// started has finished.
        /// </remarks>
        /// <param name="url">Address of the page to scan for img tags.</param>
        private void ShuaQu(string url)
        {
            // Start every run from an empty output directory.
            if (System.IO.Directory.Exists(Path))
            {
                System.IO.Directory.Delete(Path, true);
            }

            System.IO.Directory.CreateDirectory(Path);

            // NOTE(review): the published source read "HttpGet(url, null, )" with the third
            // argument missing, which does not compile. 0 is assumed here; confirm the
            // value against the signature of WebHttp.HttpGet.
            string result = WebHttp.HttpGet(url, null, 0);

            string[] str = GetHtmlImageUrlList(result);

            txtimg.Invoke(new Action(() =>
            {
                txtimg.AppendText("已经获取到数据!" + str.Length + "\r\n");
            }));

            // Regular expression that locates the page's title element.
            // NOTE(review): the whole match is used, so the displayed text still contains
            // the <title> and </title> tags themselves; only double quotes are removed.
            String regex = @"<title>.+</title>";
            String title = Regex.Match(result, regex).ToString();

            txttitle.Invoke(new Action(() =>
            {
                txttitle.Text = Regex.Replace(title, @"[\""]+", "");
            }));

            List<Thread> downloads = new List<Thread>();

            foreach (string s in str)
            {
                // img src values are frequently relative or empty; new Uri(...) would throw
                // on those, so entries that are not absolute URIs are skipped instead.
                Uri u;
                if (!Uri.TryCreate(s, UriKind.Absolute, out u) || u.Host != "www.xxx.com")
                {
                    continue;
                }

                // Per-iteration copy: compilers older than C# 5 share one foreach variable
                // across iterations, so a closure over "s" could see a later URL.
                string imageUrl = s;

                Thread downimg = new Thread(() =>
                {
                    try
                    {
                        Get_img(imageUrl);
                    }
                    catch (WebException ex)
                    {
                        // An exception escaping a worker thread terminates the process;
                        // a single failed image is reported in the log instead.
                        txtimg.Invoke(new Action(() =>
                        {
                            txtimg.AppendText("下载失败: " + imageUrl + " " + ex.Message + "\r\n");
                        }));
                    }
                }) { IsBackground = true };

                downimg.Start();
                downloads.Add(downimg);

                txtimg.Invoke(new Action(() =>
                {
                    txtimg.AppendText(imageUrl + "\r\n");
                }));
            }

            // Wait for the downloads so the completion message is only written once the
            // files have actually been fetched.
            foreach (Thread download in downloads)
            {
                download.Join();
            }

            txtimg.Invoke(new Action(() =>
            {
                txtimg.AppendText("全部抓取完成!\r\n");
            }));
        }

        /// <summary>
        /// Downloads one image into the output directory. The local file name is the last
        /// path segment of the URL with everything from the first '?' onwards removed.
        /// </summary>
        /// <param name="imgpath">URL of the image; it may carry a query string.</param>
        /// <exception cref="WebException">The download or the write to disk failed.</exception>
        public void Get_img(string imgpath)
        {
            // Drop the query string so "a.jpg?w=100" is saved as "a.jpg".
            string[] file = imgpath.Split('?');
            string name = System.IO.Path.GetFileName(file[0]);

            // WebClient is IDisposable; release it as soon as the file is written.
            using (WebClient mywebclient = new WebClient())
            {
                mywebclient.DownloadFile(imgpath, System.IO.Path.Combine(Path, name));
            }
        }

        /// <summary>
        /// Extracts the src value of every img tag found in a piece of HTML.
        /// </summary>
        /// <param name="sHtmlText">HTML source to scan; null is treated as empty.</param>
        /// <returns>
        /// The src values in document order, one entry per matched img tag. The array is
        /// empty when nothing matches.
        /// </returns>
        private string[] GetHtmlImageUrlList(string sHtmlText)
        {
            // Regex.Matches throws on null input; a missing page simply has no images.
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return new string[0];
            }

            // Matches an img tag and captures its src value (quoted or unquoted) in the
            // named group "imgUrl"; matching is case-insensitive.
            Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

            return regImg.Matches(sHtmlText)
                         .Cast<Match>()
                         .Select(match => match.Groups["imgUrl"].Value)
                         .ToArray();
        }

    }

}

#region Download an image into an Image object

/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it as an image.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <returns>The decoded image; the caller owns it and should dispose it.</returns>
public static Image UrlToImage(string url) {

    byte[] Bytes;

    // WebClient is IDisposable; it is no longer needed once the bytes are in memory.
    using (WebClient mywebclient = new WebClient()) {

        Bytes = mywebclient.DownloadData(url);

    }

    // Image.FromStream requires its stream to stay open for the lifetime of the Image,
    // so the MemoryStream is deliberately NOT wrapped in a using block: disposing it
    // here would leave the returned image backed by a closed stream. A MemoryStream
    // holds only managed memory and is reclaimed together with the image.
    MemoryStream ms = new MemoryStream(Bytes);

    return Image.FromStream(ms);

}

#endregion

C# 抓取网页的img src带参数的图片链接，并下载的更多相关文章

使用wget工具抓取网页和图片成功尝试
使用wget工具抓取网页和图片发表于1年前(2014-12-17 11:29) 阅读(2471) | 评论(14) 85人收藏此文章, 我要收藏赞7 wget 网页抓取图片抓取目录[-] ...
使用wget工具抓取网页和图片及相关工具几个
想保存一些网页,最后找到这 wget 的 shell脚本,虽然不是太理想,亲测可用呢. 使用wget工具抓取网页和图片来源 https://my.oschina.net/freestyletim ...
Java 抓取网页中的内容【持续更新】
背景:前几天复习Java的时候看到URL类,当时就想写个小程序试试,迫于考试没有动手,今天写了下,感觉还不错内容1. 抓取网页中的URL 知识点:Java URL+ 正则表达式 import jav ...
C语言调用curl库抓取网页图片
思路是先用curl抓取网页源码,然后以关键字寻找出图片网址. #include <stdio.h> #include <stdlib.h> #include <str ...
C语言调用curl库抓取网页图片(转)
思路是先用curl抓取网页源码,然后以关键字寻找出图片网址. 范例: #include <stdio.h> #include <stdlib.h> #include < ...
python分布式抓取网页
呵呵,前两节好像和python没多大关系..这节完全是贴代码, 这是我第一次写python,很多地方比较乱,主要就看看逻辑流程吧. 对于编码格式确实搞得我头大..取下来页面不知道是什么编码,所以先找c ...
delphi 7中使用idhttp抓取网页解决假死现象
在delphi 7中使用idhttp抓取网页,造成窗口无反应的假死状态.通过搜索获得两种方法. 1.写在线程中,但是调用比较麻烦 2.使用delphi 提供的idantifreeze(必须安装indy ...
PHP抓取网页图片
<?php set_time_limit(0);//抓取不受时间限制 if($_POST['Submit']=="开始抓取"){ $URL=$_POST['link']; g ...
HttpClient（一）HttpClient抓取网页基本信息
一.HttpClient简介 HttpClient 是 Apache Jakarta Common 下的子项目,可以用来提供高效的.最新的.功能丰富的支持 HTTP 协议的客户端编程工具包, 并且它支 ...

随机推荐

C# LINQ系列：LINQ to DataSet的DataTable操作及 DataTable与Linq相互转换
LINQ to DataSet需要使用System.Core.dll.System.Data.dll和System.Data.DataSetExtensions.dll,在项目中添加引用System. ...
BootStrapTable 文档
文档包含了表格属性.列属性.事件.方法等等. 表格参数表格的参数定义在 jQuery.fn.bootstrapTable.defaults. 名称标签类型默认描述 - d ...
border属性
border 简写属性,用于把针对四个边框的属性设置在一个声明里 border-style 用于元素所有边框的样式,或者单独的为各边框设置样式 border-width 简写属性,用于为元素的所有边框 ...
ubuntu 安装 sublime
1.安装包下载 http://www.sublimetext.com/ 2.解压并移动到/usr/lib/下 tar -xvf Sublime.tar.bz2 mv Sublime /usr/lib/ ...
Android基础——使用Fragment适应不同屏幕和分辨率
最近事情很忙,一个新项目赶着出来,但是很多功能都要重新做,一直在编写代码.Debug.今天因为一个新程序要使用Fragment来做,虽然以前也使用过Fragment,不过没有仔细研究,今天顺道写篇文章 ...
地铁盾构管片姿态测量软件（Excel）
记得11年刚开始从事盾构测量的时候,只知道搬站(倒站),测导线,还有就是测量管片.觉得最麻烦的就是在每个管片上面放个水平长尺,用全站仪测出他的水平位置和高程,但当时是本子记录每个数据,回去在敲到电脑上 ...
JedisConnectionPool scala
/** * Created by lq on 2017/8/29. */ object JedisConnectionPool { val config = new JedisPoolConfig() ...
从零写Java Web框架——实现Ioc依赖注入
大概思路通过读取配置文件,获取框架要加载的包路径:base-package,类似于 Spring 配置文件中的: <context:component-scan base-package=&q ...
【Unity笔记】寻路导航Navigation中的区域Areas与消耗Cost
Navigation寻路导航界面下,Areas分页下是在给导航区域分类(相当于分层),以及为每个分类设置不同的消费Cost,意义在于,导航算法中会计算出的是累加起来消耗最低的路径(不一定是视觉上最短可 ...
Java编程的逻辑 (70) - 原子变量和CAS
本系列文章经补充和完善,已修订整理成书<Java编程的逻辑>,由机械工业出版社华章分社出版,于2018年1月上市热销,读者好评如潮!各大网店和书店有售,欢迎购买,京东自营链接:http: ...

C# 抓取网页的img src带参数的图片链接，并下载

C# 抓取网页的img src带参数的图片链接，并下载的更多相关文章

随机推荐

热门专题