René Nyffenegger's collection of things on the web
René Nyffenegger on Oracle - Most wanted - Feedback
 

Fuzzy search with Oracle

A common problem with German (last-) names is that it is not always clear how exactly they are spelled. I guess the same goes for other languages as well, but living (and working) in Switzerland, I am encountering this problem mostly with german names and a bit also with french names.
The following function demonstrates how that problem can be partly solved when searching for a name.
create_func.sql
create or replace function 
  fuzzy_search(p_input_s in varchar2) return varchar2 deterministic 
is
  v_string_s    varchar2(200);
  v_len         number;
  last_2        varchar2(2);
begin
  v_string_s := upper     (p_input_s);
  
  v_string_s := replace   (v_string_s,'SCH', 'H' );
  v_string_s := replace   (v_string_s,'CH' , 'H' );
  v_string_s := replace   (v_string_s,'TS' , 'Z' );
 
  v_string_s := translate (v_string_s,'J'  , 'I' );
  v_string_s := translate (v_string_s,'Y'  , 'I' );
 
  v_string_s := replace   (v_string_s,'AI' , 'EI');

  v_string_s := replace   (v_string_s,'IE' , 'I' );
  
  v_string_s := replace   (v_string_s,'PH' , 'F' );
  v_string_s := translate (v_string_s,'V'  , 'F' );
  
  v_string_s := replace   (v_string_s,'PE' , 'B' );
  
  v_string_s := translate (v_string_s,'P'  , 'B' );
  
  v_string_s := replace   (v_string_s,'CK' , 'K' );
  v_string_s := translate (v_string_s,'Q'  , 'K' );
  v_string_s := replace   (v_string_s,'QU' , 'K' );
  v_string_s := translate (v_string_s,'C'  , 'K' );
  
  v_string_s := replace   (v_string_s,'TZ' , 'Z' );
  v_string_s := replace   (v_string_s,'ZT' , 'Z' );
  
  v_string_s := replace   (v_string_s,'TD' , 'D' );
  v_string_s := replace   (v_string_s,'DT' , 'D' );
  v_string_s := translate (v_string_s,'D'  , 'D' );
 
  -- French names
  v_len         := length(v_string_s);
  last_2        := substr(v_string_s,v_len-1);

  if last_2 in ('AZ', 'AT', 'EZ', 'ET') then
     v_string_s := substr(v_string_s,1,v_len-1);
  end if;
  
  -- "Double" letters
  v_string_s := replace   (v_string_s,'MM', 'M' );
  v_string_s := replace   (v_string_s,'NN', 'N' );
  v_string_s := replace   (v_string_s,'KK', 'K' );
  v_string_s := replace   (v_string_s,'BB', 'B' );
  v_string_s := replace   (v_string_s,'FF', 'F' );
  v_string_s := replace   (v_string_s,'DD', 'D' );
  v_string_s := replace   (v_string_s,'GG', 'G' );
  v_string_s := replace   (v_string_s,'LL', 'L' );
  v_string_s := replace   (v_string_s,'SS', 'S' );
  v_string_s := replace   (v_string_s,'RR', 'R' );

  v_string_s := replace   (v_string_s,'.' , ''  );
  v_string_s := replace   (v_string_s,' ' , ''  );
  
  return v_string_s;

end fuzzy_search;
/
create_table.sql
create table names_ (
  id    number,
  name  varchar2(40)
);
ins_table.sql
insert into names_ values (1,   'Hofmann' );
insert into names_ values (2,   'Hoffmann');

insert into names_ values (3,   'Meyer'   );
insert into names_ values (4,   'Maier'   );
insert into names_ values (5,   'Meier'   );

insert into names_ values (6,   'Saffran' );
insert into names_ values (7,   'Saphran' );

insert into names_ values (8,   'A. Basler');

commit;

Selecting from the table

select * from names_ where fuzzy_search(name) = fuzzy_search('hofmann');
No distinction between single letters and doubled letters:
        ID NAME
---------- ----------------------------------------
         1 Hofmann
         2 Hoffmann
Similarity between ey, ay and ei
select * from names_ where fuzzy_search(name) = fuzzy_search('meier');
        ID NAME
---------- ----------------------------------------
         3 Meyer
         4 Maier
         5 Meier
Similarity between ph and f
select * from names_ where fuzzy_search(name) = fuzzy_search('safran');
        ID NAME
---------- ----------------------------------------
         6 Saffran
         7 Saphran
Abbreviations with dots and abbreviations without a dot:
select * from names_ where fuzzy_search(name) = fuzzy_search('a basler');
        ID NAME
---------- ----------------------------------------
         8 A. Basler

Thanks

Thanks to Joachim Schuler who improved the code on this page.

Links

See also soundex and utl_match.