【C#爬虫】抓取XX网站mp4资源地址
抓取小视频的url地址,然后将地址信息拷贝到迅雷里批量下载就ok了
主程序 代码
// Category slug -> classid expected by the site's play page:
//   yazhouqingseAV    35
//   zhifusiwaAV       29
//   zipaishipin       30
//   oumeiqingseAV     28
//   katongdongman     31
//   tongxingAV        32
//   sanjidianying     33
//   fengkuangqunjiao  34
var client = new WinHttpHelper();
var type = "fengkuangqunjiao";

// NOTE(review): the numeric literals of this listing were lost when it was published.
// classid is restored from the table above; the loop start (1) and the "first page has
// no suffix" rule are assumptions (index.html, index_2.html, ...) - confirm against the site.
var classid = 34;

for (int i = 1; ; i++)
{
    Console.WriteLine(i);

    // The first list page is "index.html"; later pages are "index_<n>.html".
    var index = "_" + i;
    if (i == 1)
    {
        index = "";
    }

    string pageUrl = "http://www.lang34.com/se/" + type + "/index" + index + ".html";

    // Every "<type>/<id>.html" occurrence on the list page is a detail-page link.
    var trs = RegexHelper.GetMathList(client.GET(pageUrl, Encoding.UTF8), "" + type + "/(.*?).html");

    // GetMathList may hand back null on failure, and a page without any link means we ran
    // past the last list page (or the request failed). Stop instead of looping forever.
    if (trs == null || trs.Count == 0)
    {
        break;
    }

    foreach (var item in trs)
    {
        // Cut the numeric id out of "<type>/<id>.html".
        string temp = "";
        if (!RegexHelper.GetMatchStr(item.ToString(), "" + type + "/(.*?).html", true, out temp))
        {
            continue;
        }

        string url = "http://www.lang34.com/e/DownSys/play/?classid=" + classid + "&id=" + temp + "&pathid=0";
        string htmltext = client.GET(url, Encoding.UTF8);

        // The player page carries the media address as  f:'<address>',
        string mp4 = "";
        if (!RegexHelper.GetMatchStr(htmltext, "f:'(.*?)',", true, out mp4))
        {
            continue;
        }

        // The title is optional: when it is missing the line is written with an empty title.
        string titile = "";
        RegexHelper.GetMatchStr(htmltext, " <title>(.*?)</title>", true, out titile);

        string output = mp4 + "?title" + titile + "\r\n";
        Console.WriteLine(output);
        File.AppendAllText("D://" + type + ".txt", output);
    }
}
网络请求类
using System;
using System.Collections.Generic;
using System.Text; namespace MyHelper4Web
{
/// <summary>
/// Thin wrapper around the WinHTTP COM object (<c>WinHttp.WinHttpRequest</c>) that issues
/// GET and POST requests and returns the response either as raw bytes or as decoded text.
/// </summary>
/// <remarks>
/// One COM request object is created per helper instance and reused for every call.
/// Request failures are not thrown to the caller: the byte-returning methods return the
/// exception message and source encoded with <see cref="Encoding.Default"/>, and the
/// string-returning methods return the same text.
/// </remarks>
public class WinHttpHelper
{
    // Underlying COM request object; created once in every constructor.
    private readonly WinHttp.WinHttpRequest request;

    /// <summary>Value sent in the Accept request header.</summary>
    public string Accept = "*/*";

    /// <summary>Value sent in the User-Agent request header.</summary>
    public string UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C; InfoPath.2; .NET4.0E)";

    /// <summary>Value sent in the Content-Type header of POST requests.</summary>
    public string ContentType = "application/json";// "application/x-www-form-urlencoded";

    /// <summary>Time to wait for a response, in seconds.</summary>
    // NOTE(review): the original default literal was lost in the published listing;
    // 30 seconds is an assumed replacement - confirm the intended value.
    public int SetTimeOut = 30;

    /// <summary>Whether HTTP redirects are followed automatically.</summary>
    public bool AllowAutoRedirect = true;

    /// <summary>Whether a redirect from https to http is allowed.</summary>
    public bool AllowHttpstoHttp = false;

    /// <summary>
    /// Creates a helper that uses the default request headers and timeout.
    /// </summary>
    public WinHttpHelper()
    {
        request = new WinHttp.WinHttpRequest();
    }

    /// <summary>
    /// Creates a helper with custom request headers.
    /// </summary>
    /// <param name="Accept">Accept header value.</param>
    /// <param name="UserAgent">User-Agent header value.</param>
    /// <param name="ContentType">Content-Type header value used for POST requests.</param>
    public WinHttpHelper(string Accept, string UserAgent, string ContentType)
        : this() // the request object must exist for every constructor, not only the default one
    {
        this.Accept = Accept;
        this.UserAgent = UserAgent;
        this.ContentType = ContentType;
    }

    /// <summary>
    /// Creates a helper with custom request headers and a custom response timeout.
    /// </summary>
    /// <param name="Accept">Accept header value.</param>
    /// <param name="UserAgent">User-Agent header value.</param>
    /// <param name="ContentType">Content-Type header value used for POST requests.</param>
    /// <param name="SetTimeOut">Time to wait for a response, in seconds.</param>
    public WinHttpHelper(string Accept, string UserAgent, string ContentType, int SetTimeOut)
        : this(Accept, UserAgent, ContentType)
    {
        this.SetTimeOut = SetTimeOut;
    }

    /// <summary>
    /// Requests a page with GET.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="refer">Referer header value; no Referer header is sent when null or empty.</param>
    /// <returns>
    /// The response body as bytes (empty when the response has no body), or the encoded
    /// exception message and source when the request fails.
    /// </returns>
    public byte[] GET(string Url, string refer)
    {
        return Execute("GET", Url, refer, null, "");
    }

    /// <summary>
    /// Requests a page with GET and decodes the response.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="Encode">Encoding used to turn the response bytes into text.</param>
    /// <returns>The response as text, or the exception message and source on failure.</returns>
    public string GET(string Url, Encoding Encode)
    {
        return Decode(GET(Url, ""), Encode);
    }

    /// <summary>
    /// Requests a page with GET, sending a Referer header, and decodes the response.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="refer">Referer header value; no Referer header is sent when null or empty.</param>
    /// <param name="Encode">Encoding used to turn the response bytes into text.</param>
    /// <returns>The response as text, or the exception message and source on failure.</returns>
    public string GET(string Url, string refer, Encoding Encode)
    {
        return Decode(GET(Url, refer), Encode);
    }

    /// <summary>
    /// Requests a page with POST.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="PostData">Request body.</param>
    /// <param name="Refer">Referer header value; no Referer header is sent when null or empty.</param>
    /// <returns>
    /// The response body as bytes (empty when the response has no body), or the encoded
    /// exception message and source when the request fails.
    /// </returns>
    public byte[] POST(string Url, string PostData, string Refer)
    {
        return Execute("POST", Url, Refer, ContentType, PostData);
    }

    /// <summary>
    /// Requests a page with POST without a Referer header.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="PostData">Request body.</param>
    /// <returns>Same as <see cref="POST(string, string, string)"/>.</returns>
    public byte[] POST(string Url, string PostData)
    {
        return POST(Url, PostData, "");
    }

    /// <summary>
    /// Requests a page with POST and decodes the response.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="PostData">Request body.</param>
    /// <param name="Refer">Referer header value; no Referer header is sent when null or empty.</param>
    /// <param name="Encode">Encoding used to turn the response bytes into text.</param>
    /// <returns>The response as text, or the exception message and source on failure.</returns>
    public string POST(string Url, string PostData, string Refer, Encoding Encode)
    {
        return Decode(POST(Url, PostData, Refer), Encode);
    }

    /// <summary>
    /// Requests a page with POST, without a Referer header, and decodes the response.
    /// </summary>
    /// <param name="Url">URL to request.</param>
    /// <param name="PostData">Request body.</param>
    /// <param name="Encode">Encoding used to turn the response bytes into text.</param>
    /// <returns>The response as text, or the exception message and source on failure.</returns>
    public string POST(string Url, string PostData, Encoding Encode)
    {
        return Decode(POST(Url, PostData, ""), Encode);
    }

    /// <summary>
    /// Returns all response headers of the most recent request as one string.
    /// </summary>
    /// <remarks>
    /// Despite the name, the result is the full header block, not only cookies; callers
    /// have to pick the Set-Cookie lines out themselves.
    /// </remarks>
    /// <returns>The header block, or an empty string when it cannot be read.</returns>
    public string GetAllCookis()
    {
        try
        {
            return request.GetAllResponseHeaders();
        }
        catch (Exception)
        {
            // Best effort by design: headers are unavailable (for example before any request was sent).
            return "";
        }
    }

    /// <summary>
    /// Sends one request and waits for its response; shared by every GET and POST overload.
    /// </summary>
    /// <param name="method">HTTP verb.</param>
    /// <param name="url">URL to request.</param>
    /// <param name="referer">Referer header value; skipped when null or empty.</param>
    /// <param name="contentType">Content-Type header value; skipped when null.</param>
    /// <param name="body">Request body handed to the COM object.</param>
    /// <returns>Response bytes, or the encoded exception message and source on failure.</returns>
    private byte[] Execute(string method, string url, string referer, string contentType, object body)
    {
        try
        {
            // Write both options on every call. The request object is reused, so only
            // writing them when they differ from the default would leave a value set by
            // an earlier call in place after the public flag has been switched back.
            request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableRedirects, AllowAutoRedirect);
            request.set_Option(WinHttp.WinHttpRequestOption.WinHttpRequestOption_EnableHttpsToHttpRedirects, AllowHttpstoHttp);

            // Opened asynchronously so that WaitForResponse can apply the timeout.
            request.Open(method, url, true);
            request.SetRequestHeader("Accept", Accept);
            request.SetRequestHeader("User-Agent", UserAgent);
            if (contentType != null)
            {
                request.SetRequestHeader("Content-Type", contentType);
            }
            if (!string.IsNullOrEmpty(referer))
            {
                request.SetRequestHeader("Referer", referer);
            }

            request.Send(body);
            request.WaitForResponse(SetTimeOut);

            // A response without a body does not yield a byte array; hand back an empty
            // array so that decoding the result never fails on null.
            byte[] payload = request.ResponseBody as byte[];
            return payload ?? new byte[0];
        }
        catch (Exception ex)
        {
            // Existing contract: the failure text is returned in place of the response body.
            return Encoding.Default.GetBytes(ex.Message + ex.Source);
        }
    }

    /// <summary>
    /// Decodes response bytes, returning the exception text instead of throwing when decoding fails.
    /// </summary>
    private static string Decode(byte[] payload, Encoding encoding)
    {
        try
        {
            return encoding.GetString(payload);
        }
        catch (Exception ex)
        {
            return ex.Message + ex.Source;
        }
    }
}
}
正则表达式类
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections; namespace MyHelper4Web
{
/// <summary>
/// Regular-expression helpers for pulling values out of HTML text.
/// </summary>
/// <remarks>
/// All patterns are evaluated with <see cref="RegexOptions.Singleline"/> and
/// <see cref="RegexOptions.IgnoreCase"/>. A pattern may contain the token <c>(.*?)</c>;
/// its first occurrence marks the part of the match that callers want "cut out".
/// </remarks>
public class RegexHelper
{
    // Token that marks the wanted part of a pattern.
    private const string CaptureToken = "(.*?)";

    // Name given to the group that replaces the first CaptureToken.
    private const string CutGroup = "__cut";

    private const RegexOptions Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;

    /// <summary>
    /// Finds the first match of <paramref name="pattern"/> and returns either the part
    /// matched by <c>(.*?)</c> or the whole match.
    /// </summary>
    /// <param name="htmltext">Text to search.</param>
    /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
    /// <param name="isCut">
    /// true to return only the text matched by <c>(.*?)</c>; false to return the whole
    /// match, including the text around it.
    /// </param>
    /// <param name="result">The extracted text, or an empty string when nothing was extracted.</param>
    /// <returns>
    /// true when the pattern matched and the part matched by <c>(.*?)</c> (or the whole
    /// match, when the pattern has no such token) is not empty.
    /// </returns>
    public static bool GetMatchStr(string htmltext, string pattern, bool isCut, out string result)
    {
        string whole;
        string captured;
        if (!TryMatch(htmltext, pattern, out whole, out captured))
        {
            result = "";
            return false;
        }

        result = isCut ? captured : whole;
        return true;
    }

    /// <summary>
    /// Same extraction as <see cref="GetMatchStr(string, string, bool, out string)"/>,
    /// returning the text directly.
    /// </summary>
    /// <param name="htmltext">Text to search.</param>
    /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
    /// <param name="isCut">
    /// true to return only the text matched by <c>(.*?)</c>; false to return the whole match.
    /// </param>
    /// <returns>The extracted text, or an empty string when nothing was extracted.</returns>
    public static string GetMatchString(string htmltext, string pattern, bool isCut)
    {
        string result;
        GetMatchStr(htmltext, pattern, isCut, out result);
        return result;
    }

    /// <summary>
    /// Finds the first match of <paramref name="pattern"/> and returns the part matched
    /// by <c>(.*?)</c>.
    /// </summary>
    /// <param name="htmltext">Text to search.</param>
    /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
    /// <param name="result">
    /// The text matched by <c>(.*?)</c>, the whole match when the pattern has no such
    /// token, or an empty string when nothing was extracted.
    /// </param>
    /// <returns>true when a non-empty value was extracted.</returns>
    public static bool GetMatchStr(string htmltext, string pattern, out string result)
    {
        string whole;
        return TryMatch(htmltext, pattern, out whole, out result);
    }

    /// <summary>
    /// Returns the text of every match of <paramref name="pattern"/>.
    /// </summary>
    /// <param name="htmltext">Text to search.</param>
    /// <param name="pattern">Regular expression.</param>
    /// <returns>
    /// A list of strings, one whole match per entry, in order of appearance. The list is
    /// empty (never null) when nothing matches, when an argument is null or when the
    /// pattern is not a valid regular expression.
    /// </returns>
    public static ArrayList GetMathList(string htmltext, string pattern)
    {
        ArrayList list = new ArrayList();
        if (htmltext == null || pattern == null)
        {
            return list;
        }

        try
        {
            Regex regex = new Regex(pattern, Options);
            foreach (Match match in regex.Matches(htmltext))
            {
                list.Add(match.Value);
            }
        }
        catch (ArgumentException)
        {
            // Invalid pattern: report "no matches" so callers can enumerate the result safely.
            list.Clear();
        }
        return list;
    }

    /// <summary>
    /// Runs the pattern once and reports both the whole match and the wanted part.
    /// </summary>
    /// <param name="htmltext">Text to search.</param>
    /// <param name="pattern">Regular expression, optionally containing <c>(.*?)</c>.</param>
    /// <param name="whole">The whole match, or an empty string on failure.</param>
    /// <param name="captured">
    /// The text matched by the first <c>(.*?)</c> (the whole match when the pattern has
    /// no such token), or an empty string on failure.
    /// </param>
    /// <returns>true when the pattern matched and <paramref name="captured"/> is not empty.</returns>
    private static bool TryMatch(string htmltext, string pattern, out string whole, out string captured)
    {
        whole = "";
        captured = "";
        if (htmltext == null || string.IsNullOrEmpty(pattern))
        {
            return false;
        }

        // Turn the first "(.*?)" into a named group. Reading the group gives exactly what
        // the token matched, independent of letter case, of other groups in the pattern
        // and of any special characters in the text around the token.
        int tokenAt = pattern.IndexOf(CaptureToken, StringComparison.Ordinal);
        string effective = pattern;
        if (tokenAt >= 0)
        {
            effective = pattern.Substring(0, tokenAt)
                + "(?<" + CutGroup + ">.*?)"
                + pattern.Substring(tokenAt + CaptureToken.Length);
        }

        Match match;
        try
        {
            match = new Regex(effective, Options).Match(htmltext);
        }
        catch (ArgumentException)
        {
            // Invalid pattern is treated as "no match".
            return false;
        }

        if (!match.Success)
        {
            return false;
        }

        string wanted = tokenAt >= 0 ? match.Groups[CutGroup].Value : match.Value;
        if (wanted.Length == 0)
        {
            // An empty extraction counts as a failure.
            return false;
        }

        whole = match.Value;
        captured = wanted;
        return true;
    }
}
}
【C#爬虫】抓取XX网站mp4资源地址的更多相关文章
- python爬虫 抓取一个网站的所有网址链接
sklearn实战-乳腺癌细胞数据挖掘 https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campai ...
- python爬虫--爬取某网站电影下载地址
前言:因为自己还是python世界的一名小学生,还有很多路要走,所以本文以目的为导向,达到目的即可,对于那些我自己都没弄懂的原理,不去做过多解释,以免误人子弟,大家可以网上搜索. 友情提示:本代码用 ...
- 一个简单的scrapy爬虫抓取豆瓣刘亦菲的图片地址
一.第一步是创建一个scrapy项目 sh-3.2# scrapy startproject liuyifeiImage sh-3.2# chmod -R 777 liuyifeiImage/ 二.分 ...
- 爬虫抓取页面数据原理(php爬虫框架有很多 )
爬虫抓取页面数据原理(php爬虫框架有很多 ) 一.总结 1.php爬虫框架有很多,包括很多傻瓜式的软件 2.照以前写过java爬虫的例子来看,真的非常简单,就是一个获取网页数据的类或者方法(这里的话 ...
- python 爬虫抓取心得
quanwei9958 转自 python 爬虫抓取心得分享 urllib.quote('要编码的字符串') 如果你要在url请求里面放入中文,对相应的中文进行编码的话,可以用: urllib.quo ...
- C# 爬虫 抓取小说
心血来潮,想研究下爬虫,爬点小说. 通过百度选择了个小说网站,随便找了一本小书http://www.23us.so/files/article/html/13/13655/index.html. 1. ...
- Java 实现 HttpClients+jsoup,Jsoup,htmlunit,Headless Chrome 爬虫抓取数据
最近整理一下手头上搞过的一些爬虫,有HttpClients+jsoup,Jsoup,htmlunit,HeadlessChrome 一,HttpClients+jsoup,这是第一代比较low,很快就 ...
- PID控制器的应用:控制网络爬虫抓取速度
一.初识PID控制器 冬天乡下人喜欢烤火取暖,常见的情形就是四人围着麻将桌,桌底放一盆碳火.有人觉得火不够大,那加点木炭吧,还不够,再加点.片刻之后,又觉得火太大,脚都快被烤熟了,那就取出一些木碳…… ...
- Python爬虫抓取东方财富网股票数据并实现MySQL数据库存储
Python爬虫可以说是好玩又好用了.现想利用Python爬取网页股票数据保存到本地csv数据文件中,同时想把股票数据保存到MySQL数据库中.需求有了,剩下的就是实现了. 在开始之前,保证已经安装好 ...
随机推荐
- Java多线程——线程同步
在之前,已经学习到了线程的创建和状态控制,但是每个线程之间几乎都没有什么太大的联系.可是有的时候,可能存在多个线程对同一个数据进行操作,这样,可能就会引发各种奇怪的问题.现在就来学习多线程对数据访问的 ...
- SQL读取系统时间的语法(转)
--获取当前日期(如:yyyymmdd) select CONVERT (nvarchar(12),GETDATE(),112) --获取当前日期(如:yyyymmdd hh:MM:ss)select ...
- sql数据库之间数据的转录
private void Form1_Load(object sender, EventArgs e) { BindDataBase(combDataBaseNew, , ""); ...
- mysql_fetch_assoc() ,mysql_fetch_array() , mysql_fetch_row()的区别
1. mysql_fetch_assoc() 函数从结果集中取得一行作为关联数组. 返回根据从结果集取得的行生成的关联数组,如果没有更多行,则返回 false. 输出: Array ( [LastNa ...
- Linq中字段数据类型转换问题(Linq to entity,LINQ to Entities 不识别方法"System.String ToString()"问题解决)
1.在工作中碰到这样一个问题: 使用linq时,需要查询两个表,在这两张表中关联字段分别是int,和varchar()也就是string,在linq中对这两个字段进行关联, 如果强制类型转换两个不同类 ...
- rails 中 create, new, build, save 的用法以及误区汇总
自己很初级,初级的不能再初级,所以初次接触rails的时候,对于里面的create,new,build等方法不是很了解,用的很混乱,导致经常出现不必要的bug,很苦恼,决定,总结一下,结合网上已有资源 ...
- jQuery上传插件Uploadify 3.2使用
Uploadify下载地址:http://www.uploadify.com/download/ 这里下载最新版的3.2的. 常用API描述: $(document).ready(function() ...
- phpcms v9 读取地区联动菜单缓存文件
读取缓存文件的方法是 getcache() 在 phpcms\libs\functions\global.func.php 中可找到. 地区联动菜单的缓存文件是 caches\caches_link ...
- IO通信
DeviceIoControl 发送控制代码到指定设备驱动程序 参数解释: hDevice Long,设备句柄 dwIoControlCode Long,应用程序调用驱动程序的控制命令,就是IOCTL ...
- android弹出式菜单、弹出式对话框、弹出式窗口
http://www.open-open.com/lib/view/open1389767042601.html http://www.open-open.com/lib/view/open13321 ...