We use an Oracle database version 11.2.0.4
We have to index a lot of company names and we want to exclude words like: ltd, inc, reg, company, lp, gp... with a stoplist
In the example below the word John is a stopword. Why this word is indexed when we use NDATA?
drop table people;
create table people (full_name varchar2(2000));
insert into people values ('John Black Smith');
-- multi_column datastore is a convenient way of adding section tags around our data
exec ctx_ddl.drop_preference('name_ds')
begin
ctx_ddl.create_preference('name_ds', 'MULTI_COLUMN_DATASTORE');
ctx_ddl.set_attribute('name_ds', 'COLUMNS', 'full_name');
end;
/
exec ctx_ddl.drop_section_group('name_sg');
begin
ctx_ddl.create_section_group('name_sg', 'BASIC_SECTION_GROUP');
ctx_ddl.add_ndata_section('name_sg', 'full_name', 'full_name');
end;
/
-- You can optionally load a thesaurus of nicknames
-- HOST ctxload -thes -name nicknames -file nicknames.txt
exec ctx_ddl.drop_preference('name_wl');
begin
ctx_ddl.create_preference('name_wl', 'BASIC_WORDLIST');
ctx_ddl.set_attribute('name_wl', 'NDATA_ALTERNATE_SPELLING', 'FALSE');
ctx_ddl.set_attribute('name_wl', 'NDATA_BASE_LETTER', 'TRUE');
-- Include the following line only if you have loaded the thesaurus
-- file
nicknames.txt:
--
ctx_ddl.set_attribute('name_wl', 'NDATA_THESAURUS', 'nicknames');
ctx_ddl.set_attribute('name_wl', 'NDATA_JOIN_PARTICLES',
'de:di:la:da:el:del:qi:abd:los:la:dos:do:an:li:yi:yu:van:jon:un:sai:ben:al');
end;
/
begin
ctx_ddl.drop_stoplist('TEST_STOPLIST');
end;
begin
ctx_ddl.create_stoplist('TEST_STOPLIST','BASIC_STOPLIST');
ctx_ddl.add_stopword('TEST_STOPLIST','John');
END;
drop index people_idx;
create index people_idx on people(full_name) indextype is ctxsys.context parameters ('datastore name_ds section group name_sg wordlist name_wl STOPLIST
TEST_STOPLIST');
-- Now you can do name searches with the following SQL:
var name varchar2(80);
exec :name := 'John'
select distinct lower(TOKEN_TEXT), sum(token_count) FROM DR$PEOPLE_IDX$I
where UPPER(token_text) like '%JOHN%'
group by (lower(TOKEN_TEXT))
ORDER BY 1 asc;
select /*+ FIRST_ROWS */ full_name, score(1)
from people
where contains(full_name, 'ndata(
full_name, John) ',1)>0
order by score(1) desc
/
Thanks for your answers.