我需要在Delphi 2010中实现一个IFilter,该IFilter可以搜索Office 2007 docx文件并返回文档中找到的文本。
ifilter还需要使用IPersistStream接口。
谢谢
最佳答案
您并不需要自己实现 IFilter 来解析 Office 2007 docx 文件。
您真正需要的是使用微软已经编写好的 IFilter 对象(Microsoft's already written IFilter objects),
通过它来读取 docx 文件的内容。
然后,您使用标准的IFilter
机制来解析文件内容:
procedure TForm1.ProcessFile(filename: string);
// Loads the IFilter registered for `filename`, initialises it and logs every
// text chunk the filter produces, 2048 characters at a time.
//
// Parameters:
//   filename - path of the document to run through its IFilter.
//
// Raises:
//   Whatever LoadIFilter raises, plus an OLE exception (via OleCheck) when
//   filter.Init fails.
//
// Value chunks are detected but their contents are not read.
var
  Filter: IFilter;
  hr: HRESULT;
  // A stack record is enough: GetChunk only needs a pointer to caller-owned
  // storage, so no per-iteration New/Dispose is required.
  chunk: STAT_CHUNK;
  flags: ULONG;
  c: Cardinal;
  buffer: WideString;
begin
  Log('Processing "'+filename+'"');

  Log('Calling LoadIFilter');
  filter := LoadIFilter(filename);
  if filter = nil then
  begin
    Log('filter is null; leaving');
    Exit;
  end;

  // No explicit "filter := nil" is needed: the compiler releases the local
  // interface reference on every exit path, including exceptions.
  Log('Calling filter.Init(IFILTER_INIT_INDEXING_ONLY)');
  hr := filter.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
  OleCheck(hr);
  Log('Init returned sucessfully, looking for chunks...');

  while True do
  begin
    hr := filter.GetChunk(@chunk);

    // These two failures only mean that the *next* chunk is an embedded or
    // linked object for which no filter could be bound. The document still has
    // further chunks, so skip this one instead of abandoning the whole file.
    if (hr = FILTER_E_EMBEDDING_UNAVAILABLE) or (hr = FILTER_E_LINK_UNAVAILABLE) then
    begin
      Log('Skipping chunk without an available filter: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
      Continue;
    end;

    // Every other failure (normally FILTER_E_END_OF_CHUNKS) ends the scan.
    if Failed(hr) then
    begin
      Log('No more chunks: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
      Break;
    end;

    Log('== Got chunk. ChunkType='+IntToStr(chunk.flags)+' (1=text, 2=value) ==');

    if (chunk.flags and CHUNK_TEXT) = CHUNK_TEXT then
    begin
      // c is in/out: buffer capacity in characters going in, number of
      // characters actually written coming out.
      c := 2048;
      SetLength(buffer, c);
      hr := filter.GetText(c, PWideChar(buffer));
      if Succeeded(hr) then
      begin
        Log('=== Got text ===');
        SetLength(buffer, c); // trim to the characters actually returned
        Log(buffer);

        // Keep draining the same chunk until GetText reports a failure
        // (FILTER_E_NO_MORE_TEXT once the chunk is exhausted).
        while Succeeded(hr) do
        begin
          c := 2048;
          SetLength(buffer, c);
          hr := filter.GetText(c, PWideChar(buffer));
          if Succeeded(hr) then
          begin
            SetLength(buffer, c);
            Log('==== Really long chunk, here''s the next 2048 characters ====');
            Log(buffer);
          end;
        end;
      end
      else
      begin
        Log('Could not get text from chunk: '+IntToHex(hr, 8)+' ('+GetChunkHResultToStr(hr)+')');
        Log(' It might be a "Value" chunk, meaning i should call filter.GetValue rather than filter.GetText. But i''m too lazy');
      end;
    end
    else if (chunk.flags and CHUNK_VALUE) = CHUNK_VALUE then
    begin
      Log('This is a "VALUE" chunk. i''m not going to read anything out of it cause it''s too hard :(');
    end
    else
      Log('Unknown chunk type');
  end; // while True: fetching chunks
end;
Windows已提供为指定文件名加载
IFilter
的代码的位置:function TForm1.LoadIFilter(const filename: WideString): IFilter;
// Asks ntQuery.LoadIFilter for the filter object that handles `filename`
// and returns it as an IFilter.
//
// Raises an OLE exception (via OleCheck) when the call reports a failure
// HRESULT; the `as` cast raises if the returned object does not support
// IFilter.
var
  loaded: IUnknown;
begin
  // The second argument (outer unknown) is nil: no aggregation is requested.
  OleCheck(ntQuery.LoadIFilter(PWideChar(filename), nil, loaded));
  Result := loaded as IFilter;
end;
IFilter声明单元:
unit Filter;
interface
uses
Windows, SysUtils, Classes, ActiveX;
type
  // All four "enums" are plain TOleEnum aliases; the values are the untyped
  // constants declared below.
  IFILTER_INIT    = TOleEnum;
  IFILTER_FLAGS   = TOleEnum;
  CHUNKSTATE      = TOleEnum;
  CHUNK_BREAKTYPE = TOleEnum;

const
  // IFILTER_INIT: single-bit flags, combinable with "or", passed as grfFlags
  // to IFilter.Init.
  IFILTER_INIT_CANON_PARAGRAPHS       = $0001;
  IFILTER_INIT_HARD_LINE_BREAKS       = $0002;
  IFILTER_INIT_CANON_HYPHENS          = $0004;
  IFILTER_INIT_CANON_SPACES           = $0008;
  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = $0010;
  IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = $0020;
  IFILTER_INIT_INDEXING_ONLY          = $0040;
  IFILTER_INIT_SEARCH_LINKS           = $0080;

  // IFILTER_FLAGS
  IFILTER_FLAGS_OLE_PROPERTIES = $0001;

  // CHUNKSTATE: bits tested against STAT_CHUNK.flags to tell text chunks
  // from value chunks.
  CHUNK_TEXT  = $01;
  CHUNK_VALUE = $02;

  // CHUNK_BREAKTYPE: sequential values (not bit flags).
  CHUNK_NO_BREAK = 0;
  CHUNK_EOW      = 1;
  CHUNK_EOS      = 2;
  CHUNK_EOP      = 3;
  CHUNK_EOC      = 4;
// Record layouts exchanged with IFilter implementations.
//
// These records are deliberately NOT declared "packed". The native structures
// use natural alignment; with 32-bit pointers every field is 4 bytes wide, so
// packed and aligned layouts coincide, but with 64-bit pointers the pointer
// member of PROPSPEC must sit on an 8-byte boundary. Packing would then shift
// every following field (and shrink FULLPROPSPEC / STAT_CHUNK), corrupting the
// data the filter writes. Leaving the records unpacked is correct on both.
type
  // Identifies a span of characters inside one chunk.
  FILTERREGION = record
    idChunk: ULONG;
    cwcStart: ULONG;
    cwcExtent: ULONG;
  end;
  tagFILTERREGION = FILTERREGION;

const
  // Values for PROPSPEC.ulKind, selecting which variant member is valid.
  PRSPEC_LPWSTR = 0;
  PRSPEC_PROPID = 1;

type
  PROPID = ULONG;

type
  // A property named either by numeric id (prid) or by string (lpws),
  // discriminated by ulKind.
  PROPSPEC = record
    ulKind: ULONG;
    case integer of
      0: (prid: PROPID);
      1: (lpws: PWideChar);
  end;
  tagPROPSPEC = PROPSPEC;

type
  // A property qualified by the GUID of its property set.
  FULLPROPSPEC = record
    guidPropSet: TGUID;
    psProperty: PROPSPEC;
  end;
  tagFULLPROPSPEC = FULLPROPSPEC;
  PFULLPROPSPEC = ^FULLPROPSPEC;

type
  // Chunk descriptor filled in by IFilter.GetChunk.
  STAT_CHUNK = record
    idChunk: ULONG;
    breakType: CHUNK_BREAKTYPE;
    flags: CHUNKSTATE;       // CHUNK_TEXT and/or CHUNK_VALUE
    locale: LCID;
    attribute: FULLPROPSPEC;
    idChunkSource: ULONG;
    cwcStartSource: ULONG;
    cwcLenSource: ULONG;
  end;
  tagSTAT_CHUNK = STAT_CHUNK;
  PSTAT_CHUNK = ^STAT_CHUNK;
// Filter status codes, from the Windows SDK header filterr.h.
//
// Naming: FILTER_E_* are failures (high bit set, Failed() is True);
// FILTER_S_* and FILTER_W_* are success/warning codes (Succeeded() is True).

//
// MessageId: FILTER_E_END_OF_CHUNKS
//
// MessageText:
//
// No more chunks of text available in object.
//
const
  FILTER_E_END_OF_CHUNKS = HRESULT($80041700);
//
// MessageId: FILTER_E_NO_MORE_TEXT
//
// MessageText:
//
// No more text available in chunk.
//
const
  FILTER_E_NO_MORE_TEXT = HRESULT($80041701);
//
// MessageId: FILTER_E_NO_MORE_VALUES
//
// MessageText:
//
// No more property values available in chunk.
//
const
  FILTER_E_NO_MORE_VALUES = HRESULT($80041702);
//
// MessageId: FILTER_E_ACCESS
//
// MessageText:
//
// Unable to access object.
//
const
  FILTER_E_ACCESS = HRESULT($80041703);
//
// MessageId: FILTER_W_MONIKER_CLIPPED
//
// MessageText:
//
// Moniker doesn't cover entire region.
//
// This is a warning, i.e. a success code: the severity bit must be clear
// ($0004....), otherwise Failed() would treat it as an error.
//
const
  FILTER_W_MONIKER_CLIPPED = HRESULT($00041704);
//
// MessageId: FILTER_E_NO_TEXT
//
// MessageText:
//
// No text in current chunk.
//
const
  FILTER_E_NO_TEXT = HRESULT($80041705);
//
// MessageId: FILTER_E_NO_VALUES
//
// MessageText:
//
// No values in current chunk.
//
const
  FILTER_E_NO_VALUES = HRESULT($80041706);
//
// MessageId: FILTER_E_EMBEDDING_UNAVAILABLE
//
// MessageText:
//
// Unable to bind IFilter for embedded object.
//
const
  FILTER_E_EMBEDDING_UNAVAILABLE = HRESULT($80041707);
//
// MessageId: FILTER_E_LINK_UNAVAILABLE
//
// MessageText:
//
// Unable to bind IFilter for linked object.
//
const
  FILTER_E_LINK_UNAVAILABLE = HRESULT($80041708);
//
// MessageId: FILTER_S_LAST_TEXT
//
// MessageText:
//
// This is the last text in the current chunk.
//
const
  FILTER_S_LAST_TEXT = HRESULT($00041709);
//
// MessageId: FILTER_S_LAST_VALUES
//
// MessageText:
//
// This is the last value in the current chunk.
//
const
  FILTER_S_LAST_VALUES = HRESULT($0004170A);
//
// MessageId: FILTER_E_PASSWORD
//
// MessageText:
//
// File was not filtered due to password protection.
//
const
  FILTER_E_PASSWORD = HRESULT($8004170B);
//
// MessageId: FILTER_E_UNKNOWNFORMAT
//
// MessageText:
//
// The document format is not recognized by the filter.
//
const
  FILTER_E_UNKNOWNFORMAT = HRESULT($8004170C);
const
  // Interface id of IFilter; identical to the GUID attached to the interface
  // declaration below.
  IID_IFilter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
  // Delphi declaration of the COM IFilter interface. Method order defines the
  // vtable layout and must not be changed.
  IFilter = interface(IUnknown)
    ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
    // Initialises a filtering session. grfFlags is a combination of
    // IFILTER_INIT_* values; aAttributes points to cAttributes entries (may be
    // nil when cAttributes is 0); pFlags receives IFILTER_FLAGS_* values.
    function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
    // Advances to the next chunk and fills the caller-owned record at pStat.
    function GetChunk(pStat: PSTAT_CHUNK): HResult; stdcall;
    // Reads text of the current chunk. pcwcBuffer is in/out: capacity of
    // awcBuffer in characters on entry, characters written on return.
    function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
    // NOTE(review): the native signature is GetValue(PROPVARIANT **ppPropValue),
    // i.e. the filter returns a POINTER to a PROPVARIANT it allocated. Declared
    // as "out PROPVARIANT", the pointer is written over the start of the
    // caller's record. The parameter should probably be
    // "out ppPropValue: PPropVariant"; left unchanged here because that would
    // break existing callers at compile time - confirm and migrate.
    function GetValue(out ppPropValue: PROPVARIANT): HResult; stdcall;
    // riid is a REFIID (pointer to GUID) in the native signature, so it must be
    // passed as "const": a by-value TGUID under stdcall would push 16 bytes on
    // the stack instead of a pointer. origPos is genuinely passed by value.
    function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
  end;
implementation
end.