html网页采集

html网页采集_编程

 

UI_Less.pas:

html网页采集_编程_02html网页采集_编程_03
  1 unit UI_Less;
  2 
  3 interface
  4 
  5 uses
  6   Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX;
  7 
  8 const
  9   WM_USER_STARTWALKING = WM_USER + 1;
 10   DISPID_AMBIENT_DLCONTROL = (-5512);
 11   READYSTATE_COMPLETE = $00000004;
 12 
 13   DLCTL_DLIMAGES = $00000010;
 14   DLCTL_VIDEOS = $00000020;
 15   DLCTL_BGSOUNDS = $00000040;
 16   DLCTL_NO_SCRIPTS = $00000080;
 17   DLCTL_NO_JAVA = $00000100;
 18   DLCTL_NO_RUNACTIVEXCTLS = $00000200;
 19   DLCTL_NO_DLACTIVEXCTLS = $00000400;
 20   DLCTL_DOWNLOADONLY = $00000800;
 21   DLCTL_NO_FRAMEDOWNLOAD = $00001000;
 22   DLCTL_RESYNCHRONIZE = $00002000;
 23   DLCTL_PRAGMA_NO_CACHE = $00004000;
 24   DLCTL_NO_BEHAVIORS = $00008000;
 25   DLCTL_NO_METACHARSET = $00010000;
 26   DLCTL_URL_ENCODING_DISABLE_UTF8 = $00020000;
 27   DLCTL_URL_ENCODING_ENABLE_UTF8 = $00040000;
 28   DLCTL_FORCEOFFLINE = $10000000;
 29   DLCTL_NO_CLIENTPULL = $20000000;
 30   DLCTL_SILENT = $40000000;
 31   DLCTL_OFFLINEIFNOTCONNECTED = $80000000;
 32   DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED;
 33 
 34 type
 35   TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink,
 36     IOleClientSite)
 37   private
 38     FDocTitle: string;
 39     FBodyText: TStrings;
 40     FBodyHtml: TStrings;
 41   protected
 42     /// IDISPATCH
 43     function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
 44       Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
 45       stdcall;
 46     /// IPROPERTYNOTIFYSINK
 47     function OnChanged(DispID: TDispID): HResult; stdcall;
 48     function OnRequestEdit(DispID: TDispID): HResult; stdcall;
 49     /// IOLECLIENTSITE
 50     function SaveObject: HResult; stdcall;
 51     function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
 52       out mk: IMoniker): HResult; stdcall;
 53     function GetContainer(out container: IOleContainer): HResult; stdcall;
 54     function ShowObject: HResult; stdcall;
 55     function OnShowWindow(fShow: BOOL): HResult; stdcall;
 56     function RequestNewObjectLayout: HResult; stdcall;
 57     ///
 58     function LoadUrlFromMoniker: HResult;
 59     function LoadUrlFromFile: HResult;
 60     // * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead.
 61 
 62   public
 63     constructor Create(AOwner: TComponent); override;
 64     destructor Destroy; override;
 65     property DocTitle: string read FDocTitle;
 66     property BodyText: TStrings read FBodyText write FBodyText;
 67     property BodyHtml: TStrings read FBodyHtml write FBodyHtml;
 68     function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
 69       : IHTMLELEMENTCollection;
 70     procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings);
 71     procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
 72   end;
 73 
 74 implementation
 75 
 76 var
 77   Doc: IhtmlDocument2;
 78   _URL: PWidechar;
 79 
 80 constructor TUILess.Create(AOwner: TComponent);
 81 begin
 82   inherited Create(AOwner);
 83   FBodyText := TStringList.Create;
 84   FBodyHtml := TStringList.Create;
 85 end;
 86 
 87 destructor TUILess.Destroy;
 88 begin
 89   if Assigned(FBodyText) then
 90     FBodyText.Free;
 91   if Assigned(FBodyHtml) then
 92     FBodyHtml.Free;
 93   inherited Destroy;
 94 end;
 95 
 96 /// CORE ---->>>>>>>>>
 97 function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean)
 98   : IHTMLELEMENTCollection;
 99 var
100   Cookie: Integer;
101   CP: IConnectionPoint;
102   OleObject: IOleObject;
103   OleControl: IOleControl;
104   CPC: IConnectionPointContainer;
105   All: IHTMLElement;
106   Msg: TMsg;
107   hr: HResult;
108 begin
109   _URL := URL;
110   IsSucceed := false;
111   try
112     CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER,
113       IID_IHTMLDocument2, Doc);
114     OleObject := Doc as IOleObject;
115     OleObject.SetClientSite(self);
116     OleControl := Doc as IOleControl;
117     OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
118     CPC := Doc as IConnectionPointContainer;
119     CPC.FindConnectionPoint(IPropertyNotifySink, CP);
120     CP.Advise(self, Cookie);
121     hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile;
122     if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then
123       while (GetMessage(Msg, 0, 0, 0)) do
124       begin
125         if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = 0)) then
126         begin
127           PostQuitMessage(0);
128           result := Doc.Get_all;
129           All := Doc.Get_body;
130           FDocTitle := string(Doc.nameProp);
131           FBodyText.Text := string(All.outerText);
132           FBodyHtml.Text := string(All.outerHTML);
133           IsSucceed := true;
134         end
135         else
136           DispatchMessage(Msg);
137         if IsStop then
138           Exit;
139       end;
140   except
141     Exit;
142   end;
143 end;
144 
145 function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer;
146   Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult;
147 var
148   I: Integer;
149 begin
150   if DISPID_AMBIENT_DLCONTROL = DispID then
151   begin
152     I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA +
153       DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS;
154     PVariant(VarResult)^ := I;
155     result := S_OK;
156   end
157   else
158     result := DISP_E_MEMBERNOTFOUND;
159 end;
160 
161 function TUILess.OnChanged(DispID: TDispID): HResult;
162 var
163   dp: TDispParams;
164   vResult: OleVariant;
165 begin
166   if (DISPID_READYSTATE = DispID) then
167     if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null,
168         LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil))
169       then
170       if Integer(vResult) = READYSTATE_COMPLETE then
171         PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, 0, 0);
172 end;
173 
174 function TUILess.LoadUrlFromMoniker: HResult;
175 var
176   Moniker: IMoniker;
177   BindCtx: IBindCTX;
178   PM: IPersistMoniker;
179 begin
180   createURLMoniker(nil, _URL, Moniker);
181   CreateBindCtx(0, BindCtx);
182   PM := Doc as IPersistMoniker;
183   result := PM.Load(LongBool(0), Moniker, BindCtx, STGM_READ)
184 end;
185 
186 function TUILess.LoadUrlFromFile: HResult;
187 var
188   PF: IPersistfile;
189 begin
190   PF := Doc as IPersistfile;
191   result := PF.Load(_URL, 0);
192 end;
193 
194 // 获取图像链接
195 procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings);
196 var
197   Image: IHTMLImgElement;
198   Disp: IDispatch;
199   x: Integer;
200 begin
201   if IC <> nil then
202   begin
203     for x := 0 to IC.Length - 1 do
204     begin
205       application.ProcessMessages;
206       Disp := IC.item(x, 0);
207       if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then
208         ImageList.add(string(Image.src));
209     end;
210   end;
211 end;
212 
213 // 获取链接
214 procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection;
215   Anchorlist: TStrings);
216 var
217   anchor: IHTMLAnchorElement;
218   Disp: IDispatch;
219   x: Integer;
220 begin
221   if IC <> nil then
222   begin
223     for x := 0 to IC.Length - 1 do
224     begin
225       application.ProcessMessages;
226       Disp := IC.item(x, 0);
227       if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and
228           (anchor.href <> '')) then
229         Anchorlist.add(string(anchor.href));
230     end;
231   end;
232 end;
233 
234 /// Don't Care ------>>>>>>>>>>>
235 function TUILess.OnRequestEdit(DispID: TDispID): HResult;
236 begin
237   result := E_NOTIMPL;
238 end;
239 
240 function TUILess.SaveObject: HResult;
241 begin
242   result := E_NOTIMPL;
243 end;
244 
245 function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint;
246   out mk: IMoniker): HResult;
247 begin
248   result := E_NOTIMPL;
249 end;
250 
251 function TUILess.GetContainer(out container: IOleContainer): HResult;
252 begin
253   result := E_NOTIMPL;
254 end;
255 
256 function TUILess.ShowObject: HResult;
257 begin
258   result := E_NOTIMPL;
259 end;
260 
261 function TUILess.OnShowWindow(fShow: BOOL): HResult;
262 begin
263   result := E_NOTIMPL;
264 end;
265 
266 function TUILess.RequestNewObjectLayout: HResult;
267 begin
268   result := E_NOTIMPL;
269 end;
270 
271 end.
View Code

 

Unit3.pas:

html网页采集_编程_02html网页采集_编程_03
  1 unit Unit3;
  2 
  3 interface
  4 
  5 uses
  6   Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  7   Dialogs, StdCtrls;
  8 
  9 type
 10   TForm3 = class(TForm)
 11     Button1: TButton;
 12     Edit1: TEdit;
 13     Memo1: TMemo;
 14     Button2: TButton;
 15     Button3: TButton;
 16     Button4: TButton;
 17     procedure Button1Click(Sender: TObject);
 18     procedure Button2Click(Sender: TObject);
 19     procedure Button3Click(Sender: TObject);
 20     procedure Button4Click(Sender: TObject);
 21   private
 22     { Private declarations }
 23     procedure into(i: Word);
 24   public
 25     { Public declarations }
 26   end;
 27 
 28 var
 29   Form3: TForm3;
 30 
 31 implementation
 32 uses UI_Less;
 33 
 34 {$R *.dfm}
 35 
 36 function DoStrToWideChar(s: string): PWideChar;
 37 var
 38  //   s:sting;
 39   pwc: PWidechar;
 40   len: integer;
 41 begin
 42   //  s:= 'abcdefg ';
 43   len := length(s) + 1;
 44   pwc := AllocMem(len * sizeof(widechar));
 45   stringtowidechar(s, pwc, len);
 46    // showmessage(widechartostring(pwc));
 47 
 48   result := pwc;
 49    //  FreeMem(pwc);
 50 end;
 51 
 52 
 53 
 54 procedure TForm3.into(i: Word);
 55 var
 56   sh: TUILess;
 57   su: boolean; // 是否获取成功
 58   // isstop: boolean; //设全局变量可以中断连接 ,避免出错
 59   surl: PWideChar;
 60 begin
 61   surl := DoStrToWideChar(Trim(Edit1.Text));
 62   sh := TUILess.Create(nil);
 63   try
 64     Memo1.Clear;
 65     case i of
 66       1:
 67         sh.GetAnchorList(sh.get(surl, su, False), Memo1.Lines);
 68       2:
 69         sh.GetImageList(sh.get(surl, su, False), Memo1.Lines);
 70       3:
 71         begin
 72           sh.get(surl, su, False);
 73           Memo1.Lines := sh.BodyText;
 74         end;
 75       4:
 76         begin
 77           sh.get(surl, su, False);
 78           Memo1.Lines := sh.BodyHtml;
 79         end;
 80     end;
 81   finally
 82     //sh.Free;
 83   end;
 84 end;
 85 
 86 procedure TForm3.Button1Click(Sender: TObject);
 87 begin
 88   into(1);
 89 end;
 90 
 91 procedure TForm3.Button2Click(Sender: TObject);
 92 begin
 93   into(2);
 94 end;
 95 
 96 procedure TForm3.Button3Click(Sender: TObject);
 97 begin
 98   into(3);
 99 end;
100 
101 procedure TForm3.Button4Click(Sender: TObject);
102 begin
103   into(4);
104 end;
105 
106 end.
View Code