用正则表达式抓取网页中的ul 和 li标签中最终的值！

获取你要抓取的页面

// Download the price-list page as a UTF-8 string, retrying on failure.
// htmlStr stays null when every attempt fails, so code that uses it must check for null.
const string URL = "http://www.hn3ddf.gov.cn/price/GetList.html?pageno=1";
const int MaxAttempts = 10;
const int BaseTimeoutMs = 10000;
const int TimeoutStepMs = 5000;

string htmlStr = null;
for (int attempt = 0; attempt < MaxAttempts; attempt++)
{
    try
    {
        System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(URL);
        request.Headers.Set("Pragma", "no-cache");
        // Allow the server a little longer on every retry.
        request.Timeout = BaseTimeoutMs + (attempt * TimeoutStepMs);

        // Dispose the response, stream and reader deterministically; otherwise every
        // attempt keeps a connection open until the garbage collector finalizes it.
        using (System.Net.HttpWebResponse response = (System.Net.HttpWebResponse)request.GetResponse())
        using (System.IO.Stream streamReceive = response.GetResponseStream())
        using (System.IO.StreamReader streamReader = new System.IO.StreamReader(streamReceive, System.Text.Encoding.UTF8))
        {
            htmlStr = streamReader.ReadToEnd();
        }
        break;
    }
    catch (Exception ex)
    {
        // Best-effort download: record the failure and fall through to the next attempt
        // instead of silently discarding the error.
        System.Diagnostics.Trace.TraceWarning(
            "Download attempt {0} of {1} failed for {2}: {3}",
            attempt + 1, MaxAttempts, URL, ex.Message);
    }
}

// Turn every price row of the downloaded page into an HTML table row.
//
// Each row in the source page is a <ul> with a fixed inline style that holds three <li> cells, e.g.
//   <ul style="font-size:12px;width:320px; margin:0; padding:0;">
//     <li style="... width:140px; ..." align="center">(product name)</li>
//     <li align="center" style="... width:100px; ...">(price, e.g. 2.83 per kg)</li>
//     <li style="... width:50px;text-align:center; ...">(date, e.g. 05-21)</li>
//   </ul>
// The patterns below match those exact attribute strings, so they stop matching if the
// page markup changes.

// htmlStr is still null when the page could not be downloaded; Regex.Matches would throw
// ArgumentNullException on it, so return an empty result instead.
if (string.IsNullOrEmpty(htmlStr))
{
    return string.Empty;
}

const string rowPattern = @"<ul style=""font-size:12px;width:320px; margin:0; padding:0;"">(.*?)</ul>";
const string namePattern = @"<li style=""color:#555555; float:left; display:block; width:140px; height:22px; line-height:22px;"" align=""center"">([^<]*)</li>";
const string pricePattern = @"<li align=""center"" style=""color:#555555; float:left; display:block; width:100px; height:22px; line-height:22px;"">([^<]*)</li>";
const string weeksPattern = @"<li style=""color:#555555; float:left; display:block; width:50px;text-align:center; height:22px; line-height:22px;"">([^<]*)</li>";
const RegexOptions cellOptions = RegexOptions.IgnoreCase | RegexOptions.Multiline;

// Singleline lets '.' match newlines, because a <ul> row spans several lines.
MatchCollection priceList = Regex.Matches(htmlStr, rowPattern, RegexOptions.Singleline);

StringBuilder resultStr = new StringBuilder();
for (int i = 0; i < priceList.Count; i++)
{
    string priceItem = priceList[i].Value;

    // Groups[0] is the whole matched <li> element; Groups[1] is the first capture group,
    // i.e. only the text between the tags. When a pattern does not match, Groups[1].Value
    // is an empty string, so the cell is emitted empty and nothing throws.
    string name = Regex.Match(priceItem, namePattern, cellOptions).Groups[1].Value;
    string price = Regex.Match(priceItem, pricePattern, cellOptions).Groups[1].Value;
    string weeks = Regex.Match(priceItem, weeksPattern, cellOptions).Groups[1].Value;

    resultStr.Append("<tr>");
    resultStr.Append("<td width=\"195\" height=\"25\" align=\"left\">").Append(name);
    resultStr.Append("</td><td width=\"70\" height=\"25\" align=\"center\" style=\"text-align:right;\">").Append(price);
    resultStr.Append("</td><td height=\"25\" align=\"center\" style=\"color:#55a8ea;\">").Append(weeks);
    resultStr.Append("</td>");
    resultStr.Append("</tr>");
}

return resultStr.ToString();

用正则表达式抓取网页中的ul 和 li标签中最终的值！的更多相关文章

正则表达式抓取文件内容中的http链接地址
import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; ...
Java 抓取网页中的内容【持续更新】
背景:前几天复习Java的时候看到URL类,当时就想写个小程序试试,迫于考试没有动手,今天写了下,感觉还不错内容1. 抓取网页中的URL 知识点:Java URL+ 正则表达式 import jav ...
Python抓取网页中的图片到本地
今天在网上找了个从网页中通过图片URL,抓取图片并保存到本地的例子: #!/usr/bin/env python # -*- coding:utf- -*- # Author: xixihuang # ...
delphi 7中使用idhttp抓取网页解决假死现象
在delphi 7中使用idhttp抓取网页,造成窗口无反应的假死状态.通过搜索获得两种方法. 1.写在线程中,但是调用比较麻烦 2.使用delphi 提供的idantifreeze(必须安装indy ...
php抓取网页中的内容
以下就是几种常用的用php抓取网页中的内容的方法.1.file_get_contentsPHP代码代码如下:>>>>>>>>>>>&g ...
delphi 7中使用idhttp抓取网页解决假死现象（使用TIdAntiFreezeControl控件）
在delphi 7中使用idhttp抓取网页,造成窗口无反应的假死状态.通过搜索获得两种方法. 1.写在线程中,但是调用比较麻烦 2.使用delphi 提供的idantifreeze(必须安装indy ...
jmeter从上一个请求使用正则表达式抓取Set-Cookie值，在下一个请求中运用
工作中遇到的问题,登录请求,返回的Response Headers中有个参数Set-Cookie,需要抓取这个参数,运用到下一个请求中,见下图: 通过正则表达式抓取Set-Cookie的值,由于该值存 ...
python 解决抓取网页中的中文显示乱码问题
关于爬虫乱码有很多各式各样的问题,这里不仅是中文乱码,编码转换.还包括一些如日文.韩文 .俄文.藏文之类的乱码处理,因为解决方式是一致的,故在此统一说明. 网络爬虫出现乱码的原因源网页编码和爬取下来 ...
python抓取网页中图片并保存到本地
#-*-coding:utf-8-*- import os import uuid import urllib2 import cookielib '''获取文件后缀名''' def get_file ...

随机推荐

百度地图坐标转换API和地图API
利用百度地图的服务将经纬度转换为米单位坐标 using System; using System.Collections.Generic; using System.Linq; using Syste ...
FormSheet式模态视图，点击模态视图外隐藏模态视图的方法
@import url(http://i.cnblogs.com/Load.ashx?type=style&file=SyntaxHighlighter.css);@import url(/c ...
px,dp,dip,sp,in,mm,pt详细分析
px,dp,dip,sp,in,mm,pt详细分析 px :(pixels),屏幕的像素点,不同的设备显示效果相同,一般我们HVGA代表320x480像素,这个用的比较多. dip :(devi ...
Hibernate 体系结构简述
SessionFactory: Hibernate的关键对象,它是单个数据库映射关系经过编译后的内存镜像,同时它是线程安全的.它是生成Session的工厂,本身需要依赖于ConnectionProvi ...
HttpClient post 请求实例
所需jar包: commons-codec-1.3.jar commons-httpclient-3.0.jar commons-logging-1.1.1.jar /** * */ package ...
lamp apache配置虚拟主机
You don't have permission to access /index.php on this server
bat命令学习笔记
1.一般在开始声明 setlocal enabledelayedexpansion 设置本地为延迟扩展.其实也就是:延迟变量,全称延迟环境变量扩展,使得批处理能够感知到变量的动态变化,在运行过程中给变 ...
HDU 4122 Alice's mooncake shop
单调队列,裸的!!坑死了,说好的“All the orders are sorted by the time in increasing order. 呢,我就当成严格上升的序列了,于是就各种错.测试 ...
#include <amp.h>
parallel_for_each(av.extent, [=](concurrency::index<1>idx)restrict(amp) {av[idx] += 1; }); //[ ...
IDE idea 更换项目的JDK步骤
1.如图:

用正则表达式抓取网页中的ul 和 li标签中最终的值！

用正则表达式抓取网页中的ul 和 li标签中最终的值！的更多相关文章

随机推荐

热门专题