René Nyffenegger's collection of things on the web
René Nyffenegger on Oracle - Most wanted - Feedback -
 

OracleText with files to be indexed on Harddisk

A table that has the file names of the documents associated with an ID for later reference.
set feedback off

create table the_docs (
  id        number primary key,
  file_name Varchar2(300)
);

begin
  ctx_ddl.create_preference('pref_datastore','FILE_DATASTORE');
end;
/
A perl script to find all files that need to be indexed:
use strict;
use warnings;

use File::Find;

open OUT, ">insert_docs_to_index.sql" or die "oh no";
my $pk=0;

find(\&d, 'f:\some_docs');

sub d {

  my $n = $File::Find::name;

  if (-f $n) {
    return if $n =~ /zip$/i;
    return if $n =~ /gif$/i;
    return if $n =~ /jpg$/i;
    $n =~ s./.\\.g;

    print OUT "insert into the_docs values($pk, '$n');\n";
    $pk++;
  }

}
Loading the data that the perl script generated.
@insert_docs_to_index
commit;
Creating the index:
create index ix_the_docs on the_docs(file_name)
  indextype is ctxsys.context
  parameters('datastore pref_datastore');

Workouround for ctx_doc.filter's slowliness

ctx_doc.filter is notoriously slow. Workaround:
create table doc_content (
  id   number,
  pos  number,
  piece Varchar2(300),
  constraint pk_doc_content primary key (id, pos)
);

create table doc_tokens (
  query_id  number,
  token     Varchar2(64),
  offset    number,
  length    number
); 

declare
  v_c clob;
begin

  for r in (select id from the_docs order by id) loop
    begin

    ctx_doc.filter(
      index_name => 'IX_THE_DOCS',
      textkey    => to_char(r.id),
      restab     => v_c,
      plaintext  => true);

    for p in 0 .. trunc(length(v_c) / 300) loop

      insert into doc_content(id, pos, piece) values
        (r.id, p, substr(v_c,(p)*300,300));

    end loop;
    exception
      when others then
        dbms_output.put_line('Exception filling doc_content for id: ' || r.id);
    end;

  end loop;
end;
/


begin
  for r in (select id from the_docs order by id) loop
    begin
      ctx_doc.tokens(
        index_name=>'IX_THE_DOCS',
        textkey   =>to_char(r.id),
        restab    =>'doc_tokens',
        query_id  =>r.id);
    exception
        when others then
          dbms_output.put_line('Exception filling doc_tokens for id: ' || r.id);
    end;
  end loop;
end;
/

alter table doc_tokens add constraint pk_the_docs primary key (query_id, token, offset);

@doc_types
@doc_search
A procedure that does the search
create or replace package search_doc is
  type rc is ref cursor;
  procedure search(
    p_q    in  Varchar2,
    p_with_highlight in boolean,
    result out rc
  );

  c_length_teaser constant number := 60;
end search_doc;
/


create or replace package body search_doc as

  procedure search(
    p_q              in  Varchar2,
    p_with_highlight in boolean,
    result          out rc
  ) is

    v_file  the_docs.file_name%type;
    v_dummy number;

    v_highlight_tab ctx_doc.highlight_tab;
    v_result_lines  result_line_tab:=result_line_tab();
    v_pos_begin    number;
    v_pos_end      number;
    v_offset_begin number;
    v_offset_end   number;
    v_token        Varchar2(64);

    v_piece        Varchar2(300);

    v_ws           number;

    --v_dummy_   Varchar(40);

  begin
  
    for r in (select /*+ first_rows */  rownum rn, score(1) sc , id , file_name 
         from the_docs where contains(file_name,p_q,1) > 0 order by score(1) desc) loop 
      exit when r.rn > 10;

      v_result_lines.extend;

      v_ws := instr(p_q,' ');

      if v_ws > 0 then
        v_token := substr(p_q,1,v_ws-1);
      else
        v_token := p_q;
      end if;


      dbms_output.put_line('v_token: ' || v_token || ', r.id: ' || r.id);

      begin
        if p_with_highlight then

          ctx_doc.highlight(
            index_name => 'IX_SNB_DOCS',
            textkey    => to_char(r.id),
            text_query => p_q,
            restab     => v_highlight_tab,
            plaintext  => true);
    
--          for i in 1 .. v_highlight_tab.count loop
--            exit when i > 1;

          v_offset_begin := v_highlight_tab(1).offset-                          c_length_teaser;
          v_offset_end   := v_highlight_tab(1).offset+v_highlight_tab(1).length+c_length_teaser; 

          --v_dummy_ := 'split';
        else
 --           dbms_output.put_line('v_offset_begin: ' || v_offset_begin || ', v_offset_end: ' || v_offset_end || ', v_pos_begin: ' || v_pos_begin || ', v_pos_end: ' || v_pos_end);

          --v_dummy_ := 'no split (' || r.id || ') ';

          select /*+ firstrows index(doc_tokens) */ offset - c_length_teaser, offset + length + c_length_teaser into v_offset_begin, v_offset_end from doc_tokens
          where token=upper(v_token) and query_id = to_char(r.id) and rownum = 1;

        end if; -- p_with_highlight

        v_pos_begin := trunc(v_offset_begin / 300);
        v_pos_end   := trunc(v_offset_end   / 300);

        if v_pos_begin < v_pos_end then
          declare
            v_piece_1 Varchar2(300);
            v_piece_2 Varchar2(300);
          begin
            select substr(piece,v_offset_begin-300*v_pos_begin) into v_piece_1
            from doc_content where id = r.id and pos=v_pos_begin;

            select substr(piece,1, v_offset_end-300*v_pos_end) into v_piece_2
            from doc_content where id = r.id and pos=v_pos_end;

            v_piece := v_piece_1 || v_piece_2;

          end;

        else

          select substr(piece,v_offset_begin-300*v_pos_begin, (v_offset_end-300*v_pos_begin) - (v_offset_begin-300*v_pos_begin)) into v_piece 
          from doc_content where id = r.id and pos=v_pos_begin;

        end if;
        v_result_lines(r.rn) :=  result_line(r.file_name,v_piece,c_length_teaser, v_offset_end-v_offset_begin-2*c_length_teaser+1);

--        end loop;
      exception
        when no_data_found then
        v_result_lines(r.rn) := result_line(r.file_name,'- -', 0, 0);

      end;
    end loop;

    open result for select * from table(cast (v_result_lines as result_line_tab));

  end search;
end search_doc;
/