UI_Less.pas:

 unit UI_Less;

 interface

 uses
Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX; const
WM_USER_STARTWALKING = WM_USER + ;
DISPID_AMBIENT_DLCONTROL = (-);
READYSTATE_COMPLETE = $; DLCTL_DLIMAGES = $;
DLCTL_VIDEOS = $;
DLCTL_BGSOUNDS = $;
DLCTL_NO_SCRIPTS = $;
DLCTL_NO_JAVA = $;
DLCTL_NO_RUNACTIVEXCTLS = $;
DLCTL_NO_DLACTIVEXCTLS = $;
DLCTL_DOWNLOADONLY = $;
DLCTL_NO_FRAMEDOWNLOAD = $;
DLCTL_RESYNCHRONIZE = $;
DLCTL_PRAGMA_NO_CACHE = $;
DLCTL_NO_BEHAVIORS = $;
DLCTL_NO_METACHARSET = $;
DLCTL_URL_ENCODING_DISABLE_UTF8 = $;
DLCTL_URL_ENCODING_ENABLE_UTF8 = $;
DLCTL_FORCEOFFLINE = $;
DLCTL_NO_CLIENTPULL = $;
DLCTL_SILENT = $;
DLCTL_OFFLINEIFNOTCONNECTED = $;
DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED; type
TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink,
IOleClientSite)
private
FDocTitle: string;
FBodyText: TStrings;
FBodyHtml: TStrings;
protected
/// IDISPATCH
function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
stdcall;
/// IPROPERTYNOTIFYSINK
function OnChanged(DispID: TDispID): HResult; stdcall;
function OnRequestEdit(DispID: TDispID): HResult; stdcall;
/// IOLECLIENTSITE
function SaveObject: HResult; stdcall;
function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult; stdcall;
function GetContainer(out container: IOleContainer): HResult; stdcall;
function ShowObject: HResult; stdcall;
function OnShowWindow(fShow: BOOL): HResult; stdcall;
function RequestNewObjectLayout: HResult; stdcall;
///
function LoadUrlFromMoniker: HResult;
function LoadUrlFromFile: HResult;
// * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead. public
constructor Create(AOwner: TComponent); override;
destructor Destroy; override;
property DocTitle: string read FDocTitle;
property BodyText: TStrings read FBodyText write FBodyText;
property BodyHtml: TStrings read FBodyHtml write FBodyHtml;
function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings);
procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
end; implementation var
Doc: IhtmlDocument2;
_URL: PWidechar; constructor TUILess.Create(AOwner: TComponent);
begin
inherited Create(AOwner);
FBodyText := TStringList.Create;
FBodyHtml := TStringList.Create;
end; destructor TUILess.Destroy;
begin
if Assigned(FBodyText) then
FBodyText.Free;
if Assigned(FBodyHtml) then
FBodyHtml.Free;
inherited Destroy;
end; /// CORE ---->>>>>>>>>
function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
: IHTMLELEMENTCollection;
var
Cookie: Integer;
CP: IConnectionPoint;
OleObject: IOleObject;
OleControl: IOleControl;
CPC: IConnectionPointContainer;
All: IHTMLElement;
Msg: TMsg;
hr: HResult;
begin
_URL := URL;
IsSucceed := false;
try
CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
IID_IHTMLDocument2, Doc);
OleObject := Doc as IOleObject;
OleObject.SetClientSite(self);
OleControl := Doc as IOleControl;
OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
CPC := Doc as IConnectionPointContainer;
CPC.FindConnectionPoint(IPropertyNotifySink, CP);
CP.Advise(self, Cookie);
hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile;
if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then
while (GetMessage(Msg, , , )) do
begin
if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = )) then
begin
PostQuitMessage();
result := Doc.Get_all;
All := Doc.Get_body;
FDocTitle := string(Doc.nameProp);
FBodyText.Text := string(All.outerText);
FBodyHtml.Text := string(All.outerHTML);
IsSucceed := true;
end
else
DispatchMessage(Msg);
if IsStop then
Exit;
end;
except
Exit;
end;
end; function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
var
I: Integer;
begin
if DISPID_AMBIENT_DLCONTROL = DispID then
begin
I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA +
DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS;
PVariant(VarResult)^ := I;
result := S_OK;
end
else
result := DISP_E_MEMBERNOTFOUND;
end; function TUILess.OnChanged(DispID: TDispID): HResult;
var
dp: TDispParams;
vResult: OleVariant;
begin
if (DISPID_READYSTATE = DispID) then
if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null,
LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil))
then
if Integer(vResult) = READYSTATE_COMPLETE then
PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, , );
end; function TUILess.LoadUrlFromMoniker: HResult;
var
Moniker: IMoniker;
BindCtx: IBindCTX;
PM: IPersistMoniker;
begin
createURLMoniker(nil, _URL, Moniker);
CreateBindCtx(, BindCtx);
PM := Doc as IPersistMoniker;
result := PM.Load(LongBool(), Moniker, BindCtx, STGM_READ)
end; function TUILess.LoadUrlFromFile: HResult;
var
PF: IPersistfile;
begin
PF := Doc as IPersistfile;
result := PF.Load(_URL, );
end; // 获取图像链接
procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
var
Image: IHTMLImgElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then
ImageList.add(string(Image.src));
end;
end;
end; // 获取链接
procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection;
Anchorlist: TStrings);
var
anchor: IHTMLAnchorElement;
Disp: IDispatch;
x: Integer;
begin
if IC <> nil then
begin
for x := to IC.Length - do
begin
application.ProcessMessages;
Disp := IC.item(x, );
if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and
(anchor.href <> '')) then
Anchorlist.add(string(anchor.href));
end;
end;
end; /// Don't Care ------>>>>>>>>>>>
function TUILess.OnRequestEdit(DispID: TDispID): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.SaveObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
out mk: IMoniker): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.GetContainer(out container: IOleContainer): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.ShowObject: HResult;
begin
result := E_NOTIMPL;
end; function TUILess.OnShowWindow(fShow: BOOL): HResult;
begin
result := E_NOTIMPL;
end; function TUILess.RequestNewObjectLayout: HResult;
begin
result := E_NOTIMPL;
end; end.

Unit3.pas:

 unit Unit3;

 interface

 uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls; type
TForm3 = class(TForm)
Button1: TButton;
Edit1: TEdit;
Memo1: TMemo;
Button2: TButton;
Button3: TButton;
Button4: TButton;
procedure Button1Click(Sender: TObject);
procedure Button2Click(Sender: TObject);
procedure Button3Click(Sender: TObject);
procedure Button4Click(Sender: TObject);
private
{ Private declarations }
procedure into(i: Word);
public
{ Public declarations }
end; var
Form3: TForm3; implementation
uses UI_Less; {$R *.dfm} function DoStrToWideChar(s: string): PWideChar;
var
// s:sting;
pwc: PWidechar;
len: integer;
begin
// s:= 'abcdefg ';
len := length(s) + ;
pwc := AllocMem(len * sizeof(widechar));
stringtowidechar(s, pwc, len);
// showmessage(widechartostring(pwc)); result := pwc;
// FreeMem(pwc);
end; procedure TForm3.into(i: Word);
var
sh: TUILess;
su: boolean; // 是否获取成功
// isstop: boolean; //设全局变量可以中断连接 ,避免出错
surl: PWideChar;
begin
surl := DoStrToWideChar(Trim(Edit1.Text));
sh := TUILess.Create(nil);
try
Memo1.Clear;
case i of
:
sh.GetAnchorList(sh.get(surl, su, False), Memo1.Lines);
:
sh.GetImageList(sh.get(surl, su, False), Memo1.Lines);
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyText;
end;
:
begin
sh.get(surl, su, False);
Memo1.Lines := sh.BodyHtml;
end;
end;
finally
//sh.Free;
end;
end; procedure TForm3.Button1Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button2Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button3Click(Sender: TObject);
begin
into();
end; procedure TForm3.Button4Click(Sender: TObject);
begin
into();
end; end.

html网页采集的更多相关文章

  1. Hawk 3. 网页采集器

    1.基本入门 1. 原理(建议阅读) 网页采集器的功能是获取网页中的数据(废话).通常来说,目标可能是列表(如购物车列表),或是一个页面中的固定字段(如JD某商品的价格和介绍,在页面中只有一个).因此 ...

  2. Fiddler 网页采集抓包利器

    最近这段时间,网页采集方面的工作做得比较多.用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示:基于weiphp做了一个掌上 ...

  3. Fiddler 网页采集抓包利器__手机app抓包

    用curl技术开发了一个微信文章聚合类产品,把抓取到的数据转换成json格式,并在android端调用json数据接口加以显示: 基于weiphp做了一个掌上头条插件,也是用的网页采集技术:和一个创业 ...

  4. 网页采集利器 phpQuery

    网页采集利器 phpQuery 2012-02-28 11:43:24|  分类: php|举报|字号 订阅     在网页采集的时候,通常都会用到正则表达式.但是有时候对于正则不太好的同学,比如我, ...

  5. 网页采集器-UA伪装

    网页采集器-UA伪装 UA伪装 请求载体身份标识的伪装: User-Agent: 请求载体身份标识,通过浏览器发起的请求,请求载体为浏览器,则该请求的User-Agent为浏览器的身份标识,如果使用爬 ...

  6. 异步网页采集利器CasperJs

    在采集网页中,我们会经常遇到采集一些异步加载页面的网页,我们通常用的httpwebrequest类就采集不到了,这个时候我们通常会采用webbrowser来辅助采集,但是.net下自带的webbrow ...

  7. 简单的网页采集程序(ASP.NET MVC4)

    因为懒人太多,造成现在网页数据采集非常的流行,我也来写个简单的记录一下. 之前写了MVC的基本框架的搭建随笔,后面因为公司太忙,个人感情问题:(,导致不想写了,就写了两篇给删除了,现在就搁浅了, 本人 ...

  8. 史林枫:开源HtmlAgilityPack公共小类库封装 - 网页采集(爬虫)辅助解析利器【附源码+可视化工具推荐】

    做开发的,可能都做过信息采集相关的程序,史林枫也经常做一些数据采集或某些网站的业务办理自动化操作软件. 获取目标网页的信息很简单,使用网络编程,利用HttpWebResponse.HttpWebReq ...

  9. C#网页采集数据的几种方式(WebClient、WebBrowser和HttpWebRequest/HttpWebResponse)

    一.通过WebClient获取网页内容 这是一种很简单的获取方式,当然,其它的获取方法也很简单.在这里首先要说明的是,如果为了实际项目的效率考虑,需要考虑在函数中分配一个内存区域.大概写法如下 //M ...

  10. 网页采集(通过HtmlAgilityPack+XPath)

    有HtmlAgilityPack这个类库可以更方便地对HTML内容进行分析和提取.因此今天特别学习和实践了一下HtmlAgilityPack和XPath,并作下笔记. 1.下载HtmlAgilityP ...

随机推荐

  1. HDU1575-Tr 【矩阵快速幂】(模板题)

    <题目链接> 题目大意: A为一个方阵,则Tr A表示A的迹(就是主对角线上各项的和),现要求Tr(A^k)%9973.  Input 数据的第一行是一个T,表示有T组数据. 每组数据的第 ...

  2. 三篇文章带你极速入门php(二)之迅速搭建php环境

    前言 今天讲一下php在windows,mac,linux上的集成环境搭建,目标是简单快速,环境这个事得对号入座,windows用phpstudy,mac用mamp,linux用lnmp一键安装,直接 ...

  3. SQLite中的FROM子句

    SQLite中的FROM子句 FROM子句从数据库中可以获取到一个或多个源表.源表通常是数据库命名的表,但也可以是视图或子查询.子查询相关的更多详细信息,我们会在后面进行介绍.当获取到多个源表时,JO ...

  4. Python学习——异常处理

    1.异常基础: 在编程过程中为了增加友好性,在程序出现bug时一般不会将错误信息显示给用户,而是现实一个提示的页面 基本语法: try: pass except Exception as e: pri ...

  5. BZOJ.4319.[cerc2008]Suffix reconstruction(后缀数组 构造 贪心)

    题目链接 \(Description\) 给定SA数组,求满足SA[]的一个原字符串(每个字符为小写字母),无解输出-1. \(Solution\) 假设我们现在有suf(SA[j]),要构造suf( ...

  6. [CF1030E]Vasya and Good Sequences

    [CF1030E]Vasya and Good Sequences 题目大意: 给定一个长度为\(n(n\le3\times10^5)\)的数列\(a_i(1\le a_i\le10^{18})\). ...

  7. Struts2标签里面调用java方法

    <s:if test="#session.user.hasPrivilegeByName(name)"> hasPrivilegeByName(name) 为User类 ...

  8. git 变基(无卵用)

    在当前分支执行rebase即可,会将提交的记录变成一条直线 git rebase

  9. 随机查出满足条件的5条数据(tp5)

    随机查出满足条件的5条数据 public function showQuestion() { $data[; $data[ $data['level'] = (int)$data['level']; ...

  10. FMDB使用简介

    转:http://my.oschina.net/youzaiyouzaie/blog/92325 源码地址:https://github.com/ccgus/fmdb 这次要分享的是在iOS中使用SQ ...