René Nyffenegger's collection of things on the web | |
René Nyffenegger on Oracle - Most wanted - Feedback
- Follow @renenyffenegger
|
OracleText with files to be indexed on Harddisk | ||
A table that has the file names of the documents associated with an ID for later reference.
set feedback off create table the_docs ( id number primary key, file_name Varchar2(300) ); begin ctx_ddl.create_preference('pref_datastore','FILE_DATASTORE'); end; /
A perl script to find all files that need to be indexed:
use strict; use warnings; use File::Find; open OUT, ">insert_docs_to_index.sql" or die "oh no"; my $pk=0; find(\&d, 'f:\some_docs'); sub d { my $n = $File::Find::name; if (-f $n) { return if $n =~ /zip$/i; return if $n =~ /gif$/i; return if $n =~ /jpg$/i; $n =~ s./.\\.g; print OUT "insert into the_docs values($pk, '$n');\n"; $pk++; } }
Loading the data that the perl script generated.
@insert_docs_to_index commit;
Creating the index:
create index ix_the_docs on the_docs(file_name) indextype is ctxsys.context parameters('datastore pref_datastore'); Workouround for ctx_doc.filter's slowliness
ctx_doc.filter is notoriously slow. Workaround:
create table doc_content ( id number, pos number, piece Varchar2(300), constraint pk_doc_content primary key (id, pos) ); create table doc_tokens ( query_id number, token Varchar2(64), offset number, length number ); declare v_c clob; begin for r in (select id from the_docs order by id) loop begin ctx_doc.filter( index_name => 'IX_THE_DOCS', textkey => to_char(r.id), restab => v_c, plaintext => true); for p in 0 .. trunc(length(v_c) / 300) loop insert into doc_content(id, pos, piece) values (r.id, p, substr(v_c,(p)*300,300)); end loop; exception when others then dbms_output.put_line('Exception filling doc_content for id: ' || r.id); end; end loop; end; / begin for r in (select id from the_docs order by id) loop begin ctx_doc.tokens( index_name=>'IX_THE_DOCS', textkey =>to_char(r.id), restab =>'doc_tokens', query_id =>r.id); exception when others then dbms_output.put_line('Exception filling doc_tokens for id: ' || r.id); end; end loop; end; / alter table doc_tokens add constraint pk_the_docs primary key (query_id, token, offset); @doc_types @doc_search
A procedure that does the search
create or replace package search_doc is type rc is ref cursor; procedure search( p_q in Varchar2, p_with_highlight in boolean, result out rc ); c_length_teaser constant number := 60; end search_doc; / create or replace package body search_doc as procedure search( p_q in Varchar2, p_with_highlight in boolean, result out rc ) is v_file the_docs.file_name%type; v_dummy number; v_highlight_tab ctx_doc.highlight_tab; v_result_lines result_line_tab:=result_line_tab(); v_pos_begin number; v_pos_end number; v_offset_begin number; v_offset_end number; v_token Varchar2(64); v_piece Varchar2(300); v_ws number; --v_dummy_ Varchar(40); begin for r in (select /*+ first_rows */ rownum rn, score(1) sc , id , file_name from the_docs where contains(file_name,p_q,1) > 0 order by score(1) desc) loop exit when r.rn > 10; v_result_lines.extend; v_ws := instr(p_q,' '); if v_ws > 0 then v_token := substr(p_q,1,v_ws-1); else v_token := p_q; end if; dbms_output.put_line('v_token: ' || v_token || ', r.id: ' || r.id); begin if p_with_highlight then ctx_doc.highlight( index_name => 'IX_SNB_DOCS', textkey => to_char(r.id), text_query => p_q, restab => v_highlight_tab, plaintext => true); -- for i in 1 .. v_highlight_tab.count loop -- exit when i > 1; v_offset_begin := v_highlight_tab(1).offset- c_length_teaser; v_offset_end := v_highlight_tab(1).offset+v_highlight_tab(1).length+c_length_teaser; --v_dummy_ := 'split'; else -- dbms_output.put_line('v_offset_begin: ' || v_offset_begin || ', v_offset_end: ' || v_offset_end || ', v_pos_begin: ' || v_pos_begin || ', v_pos_end: ' || v_pos_end); --v_dummy_ := 'no split (' || r.id || ') '; select /*+ firstrows index(doc_tokens) */ offset - c_length_teaser, offset + length + c_length_teaser into v_offset_begin, v_offset_end from doc_tokens where token=upper(v_token) and query_id = to_char(r.id) and rownum = 1; end if; -- p_with_highlight v_pos_begin := trunc(v_offset_begin / 300); v_pos_end := trunc(v_offset_end / 300); if v_pos_begin < v_pos_end then declare v_piece_1 Varchar2(300); v_piece_2 Varchar2(300); begin select substr(piece,v_offset_begin-300*v_pos_begin) into v_piece_1 from doc_content where id = r.id and pos=v_pos_begin; select substr(piece,1, v_offset_end-300*v_pos_end) into v_piece_2 from doc_content where id = r.id and pos=v_pos_end; v_piece := v_piece_1 || v_piece_2; end; else select substr(piece,v_offset_begin-300*v_pos_begin, (v_offset_end-300*v_pos_begin) - (v_offset_begin-300*v_pos_begin)) into v_piece from doc_content where id = r.id and pos=v_pos_begin; end if; v_result_lines(r.rn) := result_line(r.file_name,v_piece,c_length_teaser, v_offset_end-v_offset_begin-2*c_length_teaser+1); -- end loop; exception when no_data_found then v_result_lines(r.rn) := result_line(r.file_name,'- -', 0, 0); end; end loop; open result for select * from table(cast (v_result_lines as result_line_tab)); end search; end search_doc; / |