A google like search
The problem:
We must implement a google like search over a existing database. The database contains several tables for persons, corporations, organizations, addresses, etc.
It must be a custom solution (no thirdy-party solutions neither SQL FTS).
Its for a new web application (dozen to hundreds simultaneous users, response time under 1s for most cases, 5s for worst case).
Strategy:
We ill create a single table (registers) to point to the existing tables. We ill need also a table of tags and a relationship table (registers x tags).
A batch ill scan the relevant tables (persons, organizations, addresses, etc) and populate the tables (registers, tags, tag_register) once a day.
Rules:
The search ill be performed in all registers at once.
A "register" is a abstraction for a person a organization or any other object.
The object's data lie in a "main" table (persons table, organizations table, etc)
and in many "members" tables (addresses, contacts, profile, etc).
A "tag" is a word wich ocurrs in any of the table's text fields.
In a search for tag1, tag2 and tag3
If exists: tag1 AND tag2-100 ocurrences, tag1 AND tag3-10 ocurrences
and there's no ocurrences of tag2 and tag3 for the any register we ignore tag3
since this returns mores results than ignoring tag2.
Some objects share many properties and the search can use some properties to filtering and/or ordering,
in this example we ill simple use the name property to populate a column (vc_register) to be used for ordering.
Conclusion:
Below is the script to create the tables and procedure and populate the database with dummy data.
The procedure performs the search in a reasonable time and was implementd as a conceptual proof.
We gave a try to table variables and creating/droping indexes in temporary tables but we found no indexed temporary tables worked better.
Also the FK constraints increase performance is better than we can have guessed.
Increasing the number of tags dont impact in performance, increasing the number of registers dont impact directly in performance.
If a single tag is present in millions of registers returning all matches for it can slow down the response time.
The response time is dicted by the number os comparasions to find the intersection between to matches (join each tag search result).
Chosing wisely the words from the database to populate the tags table ill impact directly in performance.
Words like "the", "and", "a" must be avoided (low meaning, high ocurrence)
We ill must keep a eye in the most common used tags and the tags with the highest ocurrences.
/* this table points to several production tables *//* +20 columns ill be added in the final version to implement filters and order by options */create table dbo.register(
id_register int not null identity(1,1) primary key --A unique id to gather several rows from several tables helps a lot
,id_from_table int not null
,it_table char(1) not null
,vc_register varchar(200) not null -- used to order by de final results
)
create nonclustered index [ix_register-vc_register] on dbo.[register](vc_register);
GO
/* contains all words can be found in all fields from all tables */create table dbo.tag(
id_tag int identity(1,1) not null primary key nonclustered
,vc_tag varchar(50) not null -- clustered for performance
,ocurrences int not null --A count of the ocurrences of the tag, used do performance and to maximize results when one or more of the tags causes the entire search to not return matches
)
create clustered index [uix_tag-vc_tag] on dbo.[tag](vc_tag);
/* relation between words and ocurrences in the registers */create table dbo.tag_register(
id_register int not null --clustered index
,id_tag int not null --unique
)
create clustered index [uix_tag_register-id_register] on dbo.[tag_register](id_register);
create nonclustered index [ix_tag_register-id_tag] on dbo.[tag_register](id_tag);
create unique index [uix_tag_register] on dbo.[tag_register](id_register,id_tag);
alter table dbo.[tag_register] add constraint [FK_tag_register-tag] FOREIGN KEY (id_tag) references dbo.[tag] (id_tag);
alter table dbo.[tag_register] add constraint [FK_tag_register-register] FOREIGN KEY (id_register) references dbo.[register] (id_register);
/* implements a "tag's cloud" */
create table dbo.tag_dia(
id_tag int not null
,dia date not null
,qtd smallint not null
)
--create type dbo.IntArray as table
--( id int primary key nonclustered)
--GO
GO
/* search for words in millions of rows *//* rules: *//* 1.must accept n-terms to search *//* 2.must performe the search *//* in multiple tables at once *//* 3.performe not inclusive search *//* (term1 AND term2 AND ...) in opposition *//* to (term1 OR term2 OR ...) *//* 4.if a term causes the entire search to *//* fail ignore it *//* 5.response time must be less than 1s for *//* most cases, 5s to the worst scenario *//* (intersection of multiple 100k matches) */
create procedure dbo.SearchExclusive(@SearchTerm varchar(max))
as begin
set nocount on
declare @id_tag int--, @registers IntArray, @registers_temp IntArray;
create table #registers (id int);
--loop for each search term
DECLARE SearchTermCursor cursor FAST_FORWARD READ_ONLY for
select t.id_tag from tag t
join dbo.Split(@SearchTerm,';') s on s.vc_item = t.vc_tag
order by t.ocurrences desc
OPEN SearchTermCursor
FETCH next from SearchTermCursor into @id_tag -- fetch the first row
if (@@FETCH_STATUS = 0)--if first row ok
begin
insert into #registers select tr.id_register from dbo.tag_register tr where tr.id_tag = @id_tag
--create clustered index [uixid] on #registers(id)
FETCH next from SearchTermCursor into @id_tag -- fetch the second row
end
WHILE (@@FETCH_STATUS = 0 and exists(select id from #registers)) -- fetch # row
BEGIN
create table #registers_temp(id int)
insert into #registers_temp
select tr.id_register
from dbo.tag_register tr
join #registers r on r.id = tr.id_register
where tr.id_tag = @id_tag
--if this particular tag makes no match to be returned then ignore it
if exists(select id from #registers_temp)
begin
truncate table #registers; -- erase
--drop index #registers.[uixid]
--create clustered index [uixid_temp] on #registers_temp(id)
insert into #registers select t.id from #registers_temp t
--create clustered index [uixid] on #registers(id)
end
drop table #registers_temp
FETCH next from SearchTermCursor into @id_tag
END
close SearchTermCursor
deallocate SearchTermCursor
--create clustered index [uixid] on #registers(id)
--return ill be paged
select top 200 r.* from dbo.register r
join #registers t on t.id = r.id_register
order by r.vc_register
drop table #registers
--drop table #registers_temp
set nocount off
end
GO
/* Correctness testing data */
insert into register values(0,'','abc3')
insert into register values(0,'','abc2')
insert into register values(0,'','abc1')
insert into tag values('abc',3);--tag 1
insert into tag_register values(1,1);
insert into tag_register values(2,1);
insert into tag_register values(3,1);
--select * from tag_register
insert into register values(0,'','def1')
insert into register values(0,'','def2')
insert into register values(0,'','def3')
insert into register values(0,'','def4')
insert into tag values('def',4);--tag 1
insert into tag_register values(4,2);
insert into tag_register values(5,2);
insert into tag_register values(6,2);
insert into tag_register values(7,2);
insert into register values(0,'','abcdef1')
insert into register values(0,'','abcdef3')
insert into register values(0,'','abcdef2')
insert into register values(0,'','abcdef4')
insert into tag_register values(8,2);
insert into tag_register values(9,2);
insert into tag_register values(10,2);
insert into tag_register values(11,2);
insert into tag_register values(8,1);
insert into tag_register values(9,1);
insert into tag_register values(10,1);
insert into tag_register values(11,1);
insert into tag values('qwerty',1000);
insert into tag values('xyz',10000);
--select * from tag
insert into tag values('cross alfa',100000);
insert into tag values('cross beta',100000);
insert into tag values('cross gamma',100000);
insert into tag values('cross delta',100000);
GO
/* perfomance testing data */set nocount on
GO
declare @id int
set @id = (select COUNT(*) from tag)
while(@id < 10000)
begin
set @id = @id +1;
insert into tag values('tag '+CAST(@id as varchar(10)), 0)
end
GO
--decreate the clustered indexes
--inserting can take several minutes to populate thousands rows
drop index dbo.[tag_register].[uix_tag_register-id_register];
drop index dbo.[tag_register].[ix_tag_register-id_tag];
GO
declare @id int
set @id = (select COUNT(*) from register)
while(@id < 2000000)
begin
set @id = @id +1;
insert into register values(0,'','cross'+CAST(@id as varchar(10)))
insert into tag_register values(@id,5);
insert into tag_register values(@id,6);
insert into tag_register values(@id,7);
insert into tag_register values(@id,8);
end
GO
create clustered index [uix_tag_register-id_register] on dbo.[tag_register](id_register);
create nonclustered index [ix_tag_register-id_tag] on dbo.[tag_register](id_tag);
GO
drop index dbo.[tag].[uix_tag-vc_tag];
GO
declare @idt int, @idr int
set @idt = 1000
while(@idt < 2000)
begin
set @idr = 1000
set @idt = @idt +1;
while(@idr < 2000)
begin
set @idr = @idr +1;
insert into tag_register values(@idr,@idt);
end
end
GO
create clustered index [uix_tag-vc_tag] on dbo.[tag](vc_tag);
GO
set nocount off
GO
/* testing */
exec dbo.SearchExclusive 'none?;';--must return no results
GO
exec dbo.SearchExclusive 'abc;';
GO
exec dbo.SearchExclusive 'def;none';--must ignore none
GO
exec dbo.SearchExclusive 'def;abc;';
GO
exec dbo.SearchExclusive 'def;abc;qwerty;';
GO
exec dbo.SearchExclusive 'qwerty;';
GO
exec dbo.SearchExclusive 'cross alfa';
GO
exec dbo.SearchExclusive 'cross alfa;cross beta;cross gamma;cross delta';
GO
exec dbo.SearchExclusive 'cross alfa;cross beta;fnord;qwerty;def;abc;qwerty;cross gamma;cross delta;';
GO
exec dbo.SearchExclusive 'tag 1001;tag 1002;tag 1003;tag 1003;tag 1004;tag 1005;tag 1006;tag 1007;tag 1008;tag 1009;tag 1010;tag 1011;tag 1012;tag 1013;tag 1014;tag 1015;tag 1016;tag 1017;tag 1018;tag 1019;tag 1020;';
GO