我需要在Delphi 2010中实现一个IFilter,该IFilter可以搜索Office 2007 docx文件并返回文档中找到的文本。

该IFilter还需要支持IPersistStream接口。

谢谢

最佳答案

您不需要自己实现IFilter来解析Office 2007 docx文件。您应该使用Microsoft已经编写好的IFilter对象来读取docx文件的内容。

然后,您使用标准的IFilter机制来解析文件内容:

// Loads the IFilter registered for `filename`, initialises it and logs every
// text chunk it produces. Value chunks are reported but not read.
//
// Raises an EOleSysError (via OleCheck) when filter.Init fails; any exception
// raised by LoadIFilter propagates to the caller.
procedure TForm1.ProcessFile(filename: string);
const
    // Number of wide characters requested from GetText per call.
    TEXT_BUFFER_CHARS = 2048;
var
    filter: IFilter;
    hr: HRESULT;
    chunk: STAT_CHUNK;   // lives on the stack; no per-iteration New/Dispose needed
    flags: ULONG;
    charCount: ULONG;
    buffer: WideString;
    gotText: Boolean;
begin
    Log('Processing "'+filename+'"');

    Log('Calling LoadIFilter');
    filter := LoadIFilter(filename);
    if filter = nil then
    begin
        Log('filter is null; leaving');
        Exit;
    end;
    try
        Log('Calling filter.Init(IFILTER_INIT_INDEXING_ONLY)');
        hr := filter.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
        OleCheck(hr);

        Log('Init returned sucessfully, looking for chunks...');
        while True do
        begin
            hr := filter.GetChunk(@chunk);

            // These two codes only mean that the *current* chunk (an embedded
            // or linked object) has no filter; later chunks are still readable.
            if (hr = FILTER_E_EMBEDDING_UNAVAILABLE) or (hr = FILTER_E_LINK_UNAVAILABLE) then
            begin
                Log('Skipping chunk: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
                Continue;
            end;

            if Failed(hr) then
            begin
                Log('No more chunks: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
                Break;
            end;

            Log('== Got chunk. ChunkType='+IntToStr(chunk.flags)+' (1=text, 2=value) ==');

            if (chunk.flags and CHUNK_TEXT) = CHUNK_TEXT then
            begin
                gotText := False;
                repeat
                    // charCount is in/out: buffer capacity going in,
                    // characters actually written coming out.
                    charCount := TEXT_BUFFER_CHARS;
                    SetLength(buffer, charCount);
                    hr := filter.GetText(charCount, PWideChar(buffer));
                    if Failed(hr) then
                        Break;

                    SetLength(buffer, charCount);
                    if gotText then
                        Log('==== Really long chunk, here''s the next 2048 characters ====')
                    else
                        Log('=== Got text ===');
                    gotText := True;
                    Log(buffer);
                    // FILTER_S_LAST_TEXT tells us the chunk is exhausted, which
                    // saves one extra GetText round trip.
                until hr = FILTER_S_LAST_TEXT;

                if not gotText then
                begin
                    Log('Could not get text from chunk: '+IntToHex(hr, 8)+' ('+GetChunkHResultToStr(hr)+')');
                    Log('   It might be a "Value" chunk, meaning i should call filter.GetValue rather than filter.GetText. But i''m too lazy');
                end;
            end
            else if (chunk.flags and CHUNK_VALUE) = CHUNK_VALUE then
            begin
                Log('This is a "VALUE" chunk. i''m not going to read anything out of it cause it''s too hard :(');
            end
            else
                Log('Unknown chunk type');
        end; //end while true getting chunks
    finally
        // Release the filter explicitly so the file handle is freed promptly.
        filter := nil;
    end;
end;


为指定文件名加载IFilter的功能已由Windows提供(LoadIFilter函数),封装代码如下:

// Returns the IFilter that ntQuery.LoadIFilter supplies for `filename`.
// OleCheck raises an exception when the call reports a failure HRESULT.
function TForm1.LoadIFilter(const filename: WideString): IFilter;
var
    rawFilter: IUnknown;
begin
    OleCheck(ntQuery.LoadIFilter(PWideChar(filename), nil, rawFilter));

    // The loader hands back a plain IUnknown; query it for IFilter.
    Result := rawFilter as IFilter;
end;


IFilter声明单元:

unit Filter;

interface

uses
  Windows, SysUtils, Classes, ActiveX;

type
    // Bit flags passed to IFilter.Init in its grfFlags argument.
    IFILTER_INIT = TOleEnum;
    // Bit flags; presumably what IFilter.Init reports through pFlags (confirm against filter.h).
    IFILTER_FLAGS = TOleEnum;

const
    // IFILTER_INIT values (each one is a distinct bit)
    IFILTER_INIT_CANON_PARAGRAPHS       = $0001;
    IFILTER_INIT_HARD_LINE_BREAKS       = $0002;
    IFILTER_INIT_CANON_HYPHENS          = $0004;
    IFILTER_INIT_CANON_SPACES           = $0008;
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = $0010;
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = $0020;
    IFILTER_INIT_INDEXING_ONLY          = $0040;
    IFILTER_INIT_SEARCH_LINKS           = $0080;

    // IFILTER_FLAGS values
    IFILTER_FLAGS_OLE_PROPERTIES        = $0001;

type
    // Kind of content a chunk carries; stored in STAT_CHUNK.flags.
    CHUNKSTATE = TOleEnum;
    // Break classification stored in STAT_CHUNK.breakType.
    CHUNK_BREAKTYPE = TOleEnum;

const
    // CHUNKSTATE values
    CHUNK_TEXT  = 1;
    CHUNK_VALUE = 2;

    // CHUNK_BREAKTYPE values
    CHUNK_NO_BREAK = 0;
    CHUNK_EOW      = 1;
    CHUNK_EOS      = 2;
    CHUNK_EOP      = 3;
    CHUNK_EOC      = 4;

// NOTE: these records are deliberately NOT `packed`. With the compiler's
// default field alignment the layout is unchanged on 32-bit targets (every
// member is 4 bytes wide), while on 64-bit targets the pointer inside PROPSPEC
// gets the 8-byte alignment that a naturally aligned C struct has. `packed`
// would put it at offset 4 there.
type
    // Identifies a span inside a chunk; passed by value to IFilter.BindRegion.
    FILTERREGION = record
        idChunk: ULONG;
        cwcStart: ULONG;
        cwcExtent: ULONG;
    end;
    tagFILTERREGION = FILTERREGION;


const
    // Values for PROPSPEC.ulKind; they select which union member is valid.
    PRSPEC_LPWSTR =     0;
    PRSPEC_PROPID =     1;

type
    PROPID = ULONG;

type
    // Names a property either by numeric id (prid) or by string (lpws).
    // The two members overlay each other; ulKind says which one to read.
    PROPSPEC = record
        ulKind: ULONG;
        case integer of
        0: (prid: PROPID);
        1: (lpws: PWideChar);
    end;
    tagPROPSPEC = PROPSPEC;

type
    // A property set GUID plus the property within that set.
    FULLPROPSPEC = record
        guidPropSet: TGUID;
        psProperty: PROPSPEC;
    end;
    tagFULLPROPSPEC =   FULLPROPSPEC;
    PFULLPROPSPEC =         ^FULLPROPSPEC;

type
    // Chunk descriptor filled in by IFilter.GetChunk.
    //
    // Not declared `packed`: every member is naturally aligned, so the 32-bit
    // layout is unchanged, and on 64-bit targets the embedded FULLPROPSPEC
    // (which contains a pointer) is aligned and the record padded as a
    // naturally aligned C struct would be.
    STAT_CHUNK = record
        idChunk: ULONG;
        breakType: CHUNK_BREAKTYPE;
        flags: CHUNKSTATE;          // CHUNK_TEXT and/or CHUNK_VALUE
        locale: LCID;
        attribute: FULLPROPSPEC;
        idChunkSource: ULONG;
        cwcStartSource: ULONG;
        cwcLenSource: ULONG;
    end;
    tagSTAT_CHUNK =     STAT_CHUNK;
    PSTAT_CHUNK =       ^STAT_CHUNK;

// Result codes used by IFilter, taken from the Windows SDK header filterr.h.
// Codes named FILTER_E_* have the failure bit set ($8xxxxxxx); FILTER_S_* and
// FILTER_W_* are success codes, so Succeeded() is True for them.
const
    // No more chunks of text available in object.
    FILTER_E_END_OF_CHUNKS = HRESULT($80041700);

    // No more text available in chunk.
    FILTER_E_NO_MORE_TEXT = HRESULT($80041701);

    // No more property values available in chunk.
    FILTER_E_NO_MORE_VALUES = HRESULT($80041702);

    // Unable to access object.
    FILTER_E_ACCESS = HRESULT($80041703);

    // Moniker doesn't cover entire region.
    // This is a warning, i.e. a success code: the high bit must be clear.
    FILTER_W_MONIKER_CLIPPED = HRESULT($00041704);

    // No text in current chunk.
    FILTER_E_NO_TEXT = HRESULT($80041705);

    // No values in current chunk.
    FILTER_E_NO_VALUES = HRESULT($80041706);

    // Unable to bind IFilter for embedded object.
    FILTER_E_EMBEDDING_UNAVAILABLE = HRESULT($80041707);

    // Unable to bind IFilter for linked object.
    FILTER_E_LINK_UNAVAILABLE = HRESULT($80041708);

    // This is the last text in the current chunk.
    FILTER_S_LAST_TEXT = HRESULT($00041709);

    // This is the last value in the current chunk.
    FILTER_S_LAST_VALUES = HRESULT($0004170A);

    // File was not filtered due to password protection.
    FILTER_E_PASSWORD = HRESULT($8004170B);

    // The document format is not recognized by the filter.
    FILTER_E_UNKNOWNFORMAT = HRESULT($8004170C);


const
    IID_IFilter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';

type
    // COM interface for extracting text and property values from a document.
    // All methods are stdcall and return an HRESULT.
    IFilter = interface(IUnknown)
        ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
        // grfFlags: combination of IFILTER_INIT_* bits.
        // cAttributes / aAttributes: count of, and pointer to, an array of
        //   FULLPROPSPEC entries (0 / nil when none are supplied).
        // pFlags: receives flags from the filter.
        function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
        // Advances to the next chunk and fills pStat^ with its description.
        function GetChunk(pStat: PSTAT_CHUNK): HResult; stdcall;
        // pcwcBuffer is in/out: capacity of awcBuffer in wide characters on
        // entry, number of characters written on return.
        function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
        // The native signature is PROPVARIANT**: the filter returns a POINTER
        // to a PROPVARIANT, so the out parameter must be a PPropVariant.
        // Declaring it as `out PROPVARIANT` makes the filter write that
        // pointer over the first bytes of the caller's record.
        function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
        // riid is a REFIID, i.e. passed by reference. `const` makes Delphi pass
        // the address of the TGUID instead of pushing its 16 bytes on the stack.
        function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
    end;

implementation

end.

10-08 14:01