html网页采集
UI_Less.pas:
1 unit UI_Less; 2 3 interface 4 5 uses 6 Windows, Classes, Messages, Forms, MsHtml, Urlmon, ActiveX; 7 8 const 9 WM_USER_STARTWALKING = WM_USER + 1; 10 DISPID_AMBIENT_DLCONTROL = (-5512); 11 READYSTATE_COMPLETE = $00000004; 12 13 DLCTL_DLIMAGES = $00000010; 14 DLCTL_VIDEOS = $00000020; 15 DLCTL_BGSOUNDS = $00000040; 16 DLCTL_NO_SCRIPTS = $00000080; 17 DLCTL_NO_JAVA = $00000100; 18 DLCTL_NO_RUNACTIVEXCTLS = $00000200; 19 DLCTL_NO_DLACTIVEXCTLS = $00000400; 20 DLCTL_DOWNLOADONLY = $00000800; 21 DLCTL_NO_FRAMEDOWNLOAD = $00001000; 22 DLCTL_RESYNCHRONIZE = $00002000; 23 DLCTL_PRAGMA_NO_CACHE = $00004000; 24 DLCTL_NO_BEHAVIORS = $00008000; 25 DLCTL_NO_METACHARSET = $00010000; 26 DLCTL_URL_ENCODING_DISABLE_UTF8 = $00020000; 27 DLCTL_URL_ENCODING_ENABLE_UTF8 = $00040000; 28 DLCTL_FORCEOFFLINE = $10000000; 29 DLCTL_NO_CLIENTPULL = $20000000; 30 DLCTL_SILENT = $40000000; 31 DLCTL_OFFLINEIFNOTCONNECTED = $80000000; 32 DLCTL_OFFLINE = DLCTL_OFFLINEIFNOTCONNECTED; 33 34 type 35 TUILess = class(TComponent, IUnknown, IDispatch, IPropertyNotifySink, 36 IOleClientSite) 37 private 38 FDocTitle: string; 39 FBodyText: TStrings; 40 FBodyHtml: TStrings; 41 protected 42 /// IDISPATCH 43 function Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer; 44 Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult; 45 stdcall; 46 /// IPROPERTYNOTIFYSINK 47 function OnChanged(DispID: TDispID): HResult; stdcall; 48 function OnRequestEdit(DispID: TDispID): HResult; stdcall; 49 /// IOLECLIENTSITE 50 function SaveObject: HResult; stdcall; 51 function GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint; 52 out mk: IMoniker): HResult; stdcall; 53 function GetContainer(out container: IOleContainer): HResult; stdcall; 54 function ShowObject: HResult; stdcall; 55 function OnShowWindow(fShow: BOOL): HResult; stdcall; 56 function RequestNewObjectLayout: HResult; stdcall; 57 /// 58 function LoadUrlFromMoniker: HResult; 59 function LoadUrlFromFile: HResult; 60 // * We only use LoadUrlFromMoniker, but we could use LoadUrlFromFile instead. 61 62 public 63 constructor Create(AOwner: TComponent); override; 64 destructor Destroy; override; 65 property DocTitle: string read FDocTitle; 66 property BodyText: TStrings read FBodyText write FBodyText; 67 property BodyHtml: TStrings read FBodyHtml write FBodyHtml; 68 function Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean) 69 : IHTMLELEMENTCollection; 70 procedure GetAnchorList(IC: IHTMLELEMENTCollection; Anchorlist: TStrings); 71 procedure GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings); 72 end; 73 74 implementation 75 76 var 77 Doc: IhtmlDocument2; 78 _URL: PWidechar; 79 80 constructor TUILess.Create(AOwner: TComponent); 81 begin 82 inherited Create(AOwner); 83 FBodyText := TStringList.Create; 84 FBodyHtml := TStringList.Create; 85 end; 86 87 destructor TUILess.Destroy; 88 begin 89 if Assigned(FBodyText) then 90 FBodyText.Free; 91 if Assigned(FBodyHtml) then 92 FBodyHtml.Free; 93 inherited Destroy; 94 end; 95 96 /// CORE ---->>>>>>>>> 97 function TUILess.Get(URL: PWidechar; var IsSucceed: Boolean; IsStop: Boolean) 98 : IHTMLELEMENTCollection; 99 var 100 Cookie: Integer; 101 CP: IConnectionPoint; 102 OleObject: IOleObject; 103 OleControl: IOleControl; 104 CPC: IConnectionPointContainer; 105 All: IHTMLElement; 106 Msg: TMsg; 107 hr: HResult; 108 begin 109 _URL := URL; 110 IsSucceed := false; 111 try 112 CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, 113 IID_IHTMLDocument2, Doc); 114 OleObject := Doc as IOleObject; 115 OleObject.SetClientSite(self); 116 OleControl := Doc as IOleControl; 117 OleControl.OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL); 118 CPC := Doc as IConnectionPointContainer; 119 CPC.FindConnectionPoint(IPropertyNotifySink, CP); 120 CP.Advise(self, Cookie); 121 hr := LoadUrlFromMoniker; // alternative: Hr:= LoadUrlFromFile; 122 if ((SUCCEEDED(hr)) or (hr = E_PENDING)) then 123 while (GetMessage(Msg, 0, 0, 0)) do 124 begin 125 if ((Msg.message = WM_USER_STARTWALKING) and (Msg.hwnd = 0)) then 126 begin 127 PostQuitMessage(0); 128 result := Doc.Get_all; 129 All := Doc.Get_body; 130 FDocTitle := string(Doc.nameProp); 131 FBodyText.Text := string(All.outerText); 132 FBodyHtml.Text := string(All.outerHTML); 133 IsSucceed := true; 134 end 135 else 136 DispatchMessage(Msg); 137 if IsStop then 138 Exit; 139 end; 140 except 141 Exit; 142 end; 143 end; 144 145 function TUILess.Invoke(DispID: Integer; const IID: TGUID; LocaleID: Integer; 146 Flags: Word; var Params; VarResult, ExcepInfo, ArgErr: Pointer): HResult; 147 var 148 I: Integer; 149 begin 150 if DISPID_AMBIENT_DLCONTROL = DispID then 151 begin 152 I := DLCTL_DOWNLOADONLY + DLCTL_NO_SCRIPTS + DLCTL_NO_JAVA + 153 DLCTL_NO_DLACTIVEXCTLS + DLCTL_NO_RUNACTIVEXCTLS; 154 PVariant(VarResult)^ := I; 155 result := S_OK; 156 end 157 else 158 result := DISP_E_MEMBERNOTFOUND; 159 end; 160 161 function TUILess.OnChanged(DispID: TDispID): HResult; 162 var 163 dp: TDispParams; 164 vResult: OleVariant; 165 begin 166 if (DISPID_READYSTATE = DispID) then 167 if SUCCEEDED((Doc as IhtmlDocument2).Invoke(DISPID_READYSTATE, GUID_null, 168 LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET, dp, @vResult, nil, nil)) 169 then 170 if Integer(vResult) = READYSTATE_COMPLETE then 171 PostThreadMessage(GetCurrentThreadId(), WM_USER_STARTWALKING, 0, 0); 172 end; 173 174 function TUILess.LoadUrlFromMoniker: HResult; 175 var 176 Moniker: IMoniker; 177 BindCtx: IBindCTX; 178 PM: IPersistMoniker; 179 begin 180 createURLMoniker(nil, _URL, Moniker); 181 CreateBindCtx(0, BindCtx); 182 PM := Doc as IPersistMoniker; 183 result := PM.Load(LongBool(0), Moniker, BindCtx, STGM_READ) 184 end; 185 186 function TUILess.LoadUrlFromFile: HResult; 187 var 188 PF: IPersistfile; 189 begin 190 PF := Doc as IPersistfile; 191 result := PF.Load(_URL, 0); 192 end; 193 194 // 获取图像链接 195 procedure TUILess.GetImageList(IC: IHTMLELEMENTCollection; ImageList: TStrings); 196 var 197 Image: IHTMLImgElement; 198 Disp: IDispatch; 199 x: Integer; 200 begin 201 if IC <> nil then 202 begin 203 for x := 0 to IC.Length - 1 do 204 begin 205 application.ProcessMessages; 206 Disp := IC.item(x, 0); 207 if SUCCEEDED(Disp.QueryInterface(IHTMLImgElement, Image)) then 208 ImageList.add(string(Image.src)); 209 end; 210 end; 211 end; 212 213 // 获取链接 214 procedure TUILess.GetAnchorList(IC: IHTMLELEMENTCollection; 215 Anchorlist: TStrings); 216 var 217 anchor: IHTMLAnchorElement; 218 Disp: IDispatch; 219 x: Integer; 220 begin 221 if IC <> nil then 222 begin 223 for x := 0 to IC.Length - 1 do 224 begin 225 application.ProcessMessages; 226 Disp := IC.item(x, 0); 227 if (SUCCEEDED(Disp.QueryInterface(IHTMLAnchorElement, anchor)) and 228 (anchor.href <> '')) then 229 Anchorlist.add(string(anchor.href)); 230 end; 231 end; 232 end; 233 234 /// Don't Care ------>>>>>>>>>>> 235 function TUILess.OnRequestEdit(DispID: TDispID): HResult; 236 begin 237 result := E_NOTIMPL; 238 end; 239 240 function TUILess.SaveObject: HResult; 241 begin 242 result := E_NOTIMPL; 243 end; 244 245 function TUILess.GetMoniker(dwAssign: Longint; dwWhichMoniker: Longint; 246 out mk: IMoniker): HResult; 247 begin 248 result := E_NOTIMPL; 249 end; 250 251 function TUILess.GetContainer(out container: IOleContainer): HResult; 252 begin 253 result := E_NOTIMPL; 254 end; 255 256 function TUILess.ShowObject: HResult; 257 begin 258 result := E_NOTIMPL; 259 end; 260 261 function TUILess.OnShowWindow(fShow: BOOL): HResult; 262 begin 263 result := E_NOTIMPL; 264 end; 265 266 function TUILess.RequestNewObjectLayout: HResult; 267 begin 268 result := E_NOTIMPL; 269 end; 270 271 end.
Unit3.pas:
1 unit Unit3; 2 3 interface 4 5 uses 6 Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms, 7 Dialogs, StdCtrls; 8 9 type 10 TForm3 = class(TForm) 11 Button1: TButton; 12 Edit1: TEdit; 13 Memo1: TMemo; 14 Button2: TButton; 15 Button3: TButton; 16 Button4: TButton; 17 procedure Button1Click(Sender: TObject); 18 procedure Button2Click(Sender: TObject); 19 procedure Button3Click(Sender: TObject); 20 procedure Button4Click(Sender: TObject); 21 private 22 { Private declarations } 23 procedure into(i: Word); 24 public 25 { Public declarations } 26 end; 27 28 var 29 Form3: TForm3; 30 31 implementation 32 uses UI_Less; 33 34 {$R *.dfm} 35 36 function DoStrToWideChar(s: string): PWideChar; 37 var 38 // s:sting; 39 pwc: PWidechar; 40 len: integer; 41 begin 42 // s:= 'abcdefg '; 43 len := length(s) + 1; 44 pwc := AllocMem(len * sizeof(widechar)); 45 stringtowidechar(s, pwc, len); 46 // showmessage(widechartostring(pwc)); 47 48 result := pwc; 49 // FreeMem(pwc); 50 end; 51 52 53 54 procedure TForm3.into(i: Word); 55 var 56 sh: TUILess; 57 su: boolean; // 是否获取成功 58 // isstop: boolean; //设全局变量可以中断连接 ,避免出错 59 surl: PWideChar; 60 begin 61 surl := DoStrToWideChar(Trim(Edit1.Text)); 62 sh := TUILess.Create(nil); 63 try 64 Memo1.Clear; 65 case i of 66 1: 67 sh.GetAnchorList(sh.get(surl, su, False), Memo1.Lines); 68 2: 69 sh.GetImageList(sh.get(surl, su, False), Memo1.Lines); 70 3: 71 begin 72 sh.get(surl, su, False); 73 Memo1.Lines := sh.BodyText; 74 end; 75 4: 76 begin 77 sh.get(surl, su, False); 78 Memo1.Lines := sh.BodyHtml; 79 end; 80 end; 81 finally 82 //sh.Free; 83 end; 84 end; 85 86 procedure TForm3.Button1Click(Sender: TObject); 87 begin 88 into(1); 89 end; 90 91 procedure TForm3.Button2Click(Sender: TObject); 92 begin 93 into(2); 94 end; 95 96 procedure TForm3.Button3Click(Sender: TObject); 97 begin 98 into(3); 99 end; 100 101 procedure TForm3.Button4Click(Sender: TObject); 102 begin 103 into(4); 104 end; 105 106 end.