Watch, Follow, &
Connect with Us

For forums, blogs and more please visit our
Developer Tools Community.


ID: 21937, Parsing HTML

by Simon Carter Email: Anonymous


The following code demonstrates how to parse an HTML file, looking for:

Begin Tag / End Tag / Raw Text
Download Details
FTP  download also available
CDN Login Required to Download. (You will be redirected to the login page if you click on the Download Link)
To download this, you must have registered:
A free membership

For Delphi, Version 2.0  to 7.0 479 downloads
Copyright: All rights reserved


Size: 12,170 bytes
Updated on Wed, 07 Jul 2004 02:44:11 GMT
Originally uploaded on Wed, 07 Jul 2004 02:40:42 GMT
SHA1 Hash: 802B509CE11D79844195EEFE75AE24067AE770A5
MD5 Hash: B6A607FCA13E1A7E75724DC9744A85CD

    Explore the files in this upload

Description
The following routine demonstrates how to parse an HTML file.

I welcome feedback to improve the routine; if you have any suggestions or hints, please let me know.

rgds

Si Carter


---------------- BEGIN CODE BLOCK ------------------------

unit HTMLParse;
(***************************************************************************

HTMLParse

Purpose: Parse a html file to extract tags and plain text.

Copyright © 2003 - TECT Software Ltd. All Rights Reserved.

All code remains the property of TECT Software Ltd and may not
be changed without permission. Use of this code is granted to
any developer for private, open source or commercial applications.

No warranty expressed or implied. Use at own risk.


Contact:
WEB - www.tectsoft.net
EMail - simon.carter@tectsoft.com

Copyright Notice Must Remain With File.

Visit www.tectsoft.net for *low cost* developer friendly web hosting.

Requires:
FastStrings from http://www.droopyeyes.com

Usage:
See Demo File.

****************************************************************************)

interface

uses Classes, FastStringFuncs, FastStrings;

type
  (* Kind of fragment handed to the parse callback:
       ttBeginTag - a tag whose name does not start with '/'
       ttEndTag   - a tag whose name starts with '/'
       ttRawText  - text found outside of tags *)
  TTagType = (ttBeginTag, ttEndTag, ttRawText);

  (* Callback invoked by ParseHTML once per fragment.
       HTMLData   - the upper-cased tag name for tags, or the text itself
                    for ttRawText
       TagType    - which kind of fragment this is
       Parameters - the remainder of the tag split into lines; nil for
                    ttRawText. The list is freed by ParseHTML after the
                    callback returns, so copy anything that must be kept. *)
  THTMLParseProc = procedure(
    const HTMLData: string;
    TagType: TTagType;
    Parameters: TStrings);

(* Walks HTML and reports every tag and every run of text to ParseProc. *)
procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);

implementation

uses SysUtils;

const
  (* HTML character entities decoded in raw text, stored as
     [entity, replacement] pairs.

     The pairs are applied in array order. '&amp;' is deliberately the last
     entry: decoding it first would turn text such as '&amp;lt;' into '&lt;'
     and then, on a later pass, wrongly into '<'. *)
  THTMLReplaceWords: array[0..4] of array[0..1] of string = (
    ('&nbsp;', ' '),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
    ('&amp;', '&'));

(* ParseHTML

   Scans HTML one character at a time and reports each fragment to ParseProc:
     - text between '<' and '>' is reported as ttBeginTag, or as ttEndTag
       when the tag name starts with '/';
     - everything else is reported as ttRawText.
   Fragments that are empty after trimming are not reported.

   Parameters:
     HTML      - the markup to scan. Scanning stops at the first #0 character.
     ParseProc - callback receiving each fragment; must be assigned
                 (checked with Assert).

   Notes:
     - Text following the last '>' (including input that contains no tags
       at all) is reported as ttRawText once scanning finishes.
     - A '>' that is not closing a tag is kept as part of the raw text.
     - A '<' always starts a new tag; anything buffered before it, including
       an unterminated tag, is reported as ttRawText. *)
procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);

  (* Reports a single fragment to ParseProc.
       IsTag    - True when HTMLData is the inside of a '<...>' pair
       HTMLData - the fragment; trimmed here, and ignored when empty *)
  procedure CallTagProc(IsTag: Boolean; HTMLData: string);
  var
    TagName: string;
    Params: TStringList;
    SpacePos: Integer;
    I: Integer;
  begin
    HTMLData := Trim(HTMLData);
    if Length(HTMLData) = 0 then
      Exit;

    if IsTag then
    begin
      // The tag name is everything up to the first space (or the whole tag).
      SpacePos := Pos(' ', HTMLData);
      if SpacePos > 0 then
        TagName := Trim(Copy(HTMLData, 1, SpacePos))
      else
        TagName := HTMLData;

      Params := TStringList.Create;
      try
        // Split what follows the tag name into one list entry per line:
        // every ';' and every '" ' becomes a line break, then the remaining
        // double quotes are removed.
        Params.Text := Trim(Copy(HTMLData, Length(TagName) + 1,
          Length(HTMLData)));
        Params.Text := Trim(FastReplace(Params.Text, ';', #13));
        Params.Text := Trim(FastReplace(Params.Text, '" ', #13));
        Params.Text := Trim(FastReplace(Params.Text, '"', ''));

        // The leading '/' of an end tag stays part of the reported name.
        if Copy(TagName, 1, 1) = '/' then
          ParseProc(UpperCase(TagName), ttEndTag, Params)
        else
          ParseProc(UpperCase(TagName), ttBeginTag, Params);
      finally
        Params.Free;
      end;
    end
    else
    begin
      // Decode the entities listed in THTMLReplaceWords, in array order.
      for I := Low(THTMLReplaceWords) to High(THTMLReplaceWords) do
        HTMLData := FastReplace(HTMLData, THTMLReplaceWords[I, 0],
          THTMLReplaceWords[I, 1]);

      ParseProc(HTMLData, ttRawText, nil);
    end;
  end;

var
  Buffer: string;
  P: PChar;
  InTag: Boolean;
begin
  Assert(Assigned(ParseProc));
  P := PChar(HTML);
  Buffer := '';
  InTag := False;

  while P^ <> #0 do
  begin
    case P^ of
      '<':
        begin
          // Whatever was collected so far is text preceding this tag.
          CallTagProc(False, Buffer);
          Buffer := '';
          InTag := True;
        end;
      '>':
        begin
          if InTag then
          begin
            CallTagProc(True, Buffer);
            Buffer := '';
            InTag := False;
          end
          else
            // Not closing a tag: keep the character as ordinary text
            // instead of reporting the preceding text as a tag.
            Buffer := Buffer + P^;
        end;
    else
      Buffer := Buffer + P^;
    end; //case
    Inc(P);
  end;

  // Report whatever follows the last tag; without this, trailing text and
  // tag-free input would never reach the callback.
  CallTagProc(False, Buffer);
end;

end.


---------------- END CODE BLOCK ------------------------

For more information, see http://www.tectsoft.net/

   Latest Comments  View All Add New

Move mouse over comment to see the full text

Could not retrieve comments. Please try again later.

Server Response from: ETNACDC03