January 3, 2010 at 6:52 am
I am doing a project where I need to Mark similar records in a table with millions of records. By similar I mean some fields have same or similar values. I was first selecting all rows and then loop through all records in the table using cursors and update the record with duplicateid and type of match based on which fields were used to do the matching which worked fine but then I used a subquery because looping through cursors was very slow.
Here is the create statement
CREATE TABLE [dbo].[mjtest](
[InvoiceId] [int] IDENTITY(1,1) NOT NULL,
[ProjectId] [int] NOT NULL,
[CompanyCode] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[VendorNumber] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[CheckNumber] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[InvoiceNumber] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[VoucherNumber] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[DocumentNumber] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[Ap1] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[Ap2] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[Ap3] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[UserId] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[PurchaseOrdNumber] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[CheckStatus] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[VendorName] [varchar](2000) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[InvoiceDate] [datetime] NULL,
[InvoiceNo] [text] COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[CheckDate] [datetime] NULL,
[InvoiceAmount] [float] NULL,
[CheckAmount] [float] NULL,
[tagging] [varchar](250) COLLATE SQL_Latin1_General_CP1_CI_AS NULL CONSTRAINT [DF_mjtest_tagging] DEFAULT ('no'),
[auto_tag] [varchar](250) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[duplicate_type] [varchar](250) COLLATE SQL_Latin1_General_CP1_CI_AS NULL CONSTRAINT [DF_mjtest_duplicate_type] DEFAULT ('no'),
[convertion_done] [tinyint] NULL CONSTRAINT [DF_mjtest_convertion_done] DEFAULT ((0)),
[duplicate_with] [bigint] NULL CONSTRAINT [DF_mjtest_duplicate_with] DEFAULT ((0)),
[StatusId] [int] NULL CONSTRAINT [DF_mjtest_StatusId] DEFAULT ((0)),
[unmark] [tinyint] NULL,
[reason] [varchar](max) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[discount] [float] NULL,
[tagstatus] [int] NULL CONSTRAINT [DF_mjtest_tagstatus] DEFAULT ((0)),
[sortbyid] [bigint] NULL,
[chkfornegative] [tinyint] NULL CONSTRAINT [DF_mjtest_chkfornegative] DEFAULT ((0)),
[id] [bigint] NULL,
CONSTRAINT [PK_dbo.mjtest] PRIMARY KEY CLUSTERED
(
[InvoiceId] ASC
)WITH (IGNORE_DUP_KEY = OFF) ON [PRIMARY]
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
Insert statements
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(1,50,'MTGI3','NOR03','A-286','286','16270','16270','a1414','b1211','c15281 ','11111','A134590','P','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Oct 16 2008 12:00:00:000AM',-5.220000000000000e+002,-5.220000000000000e+002,'TD','YES','TRUE DUPLICATE',0,1,0,0,NULL,4.800000000000000e+001,0,1,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(2,50,'MTGI3','NOR03','5315','286','16370','16370','a1415','b1212','c15282 ','11112','A134591','V','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Nov 14 2008 12:00:00:000AM',2.000000000000000e-002,2.000000000000000e-002,'TDCR','MATCHED','MATCHED CREDIT',0,2,0,0,NULL,4.800000000000000e+001,0,2,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(3,50,'MTGI3','NOR03','5315','286','16339','16339','a1416','b1213','c15283 ','11113','A134592','P','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Nov 13 2008 12:00:00:000AM',-2.000000000000000e-002,-2.000000000000000e-002,'TDCR','MATCHED','MATCHED CREDIT',0,2,0,0,NULL,4.800000000000000e+001,0,2,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(4,50,'MTGI3','NOR03','5328','286','16425','16425','a1417','b1214','c15284 ','11114','A134593','P','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Nov 26 2008 12:00:00:000AM',-5.220000000000000e+002,-5.220000000000000e+002,'TD','YES','TRUE DUPLICATE',0,1,0,0,NULL,4.800000000000000e+001,0,1,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(5,50,'MTGI3','NOR03','7899','DM00000286','16426','16426','a1418','b1215','c15285 ','11115','A134594','D','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Oct 15 2008 12:00:00:000AM',2.000000000000000e-002,2.000000000000000e-002,'no','no',NULL,0,0,0,0,NULL,4.800000000000000e+001,0,22,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(6,50,'MTGI3','NOR03','A-286','286','16353','16353','a1419','b1216','c15286 ','11116','A134595','V','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Nov 13 2008 12:00:00:000AM',5.220000000000000e+002,5.220000000000000e+002,'TD','YES','TRUE DUPLICATE',0,6,0,0,NULL,4.800000000000000e+001,0,1,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(7,50,'MTGI3','RAM03','5318','CK RQST 10/31/0','16342','16342','a1420','b1217','c15287 ','11117','A134596','P','FELIPE RAMIREZ','Oct 31 2008 12:00:00:000AM',NULL,'Nov 13 2008 12:00:00:000AM',-4.136400000000000e+002,-4.136400000000000e+002,'no','no',NULL,0,0,0,0,NULL,4.800000000000000e+001,0,22,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(8,50,'MTGI3','SYS03','101214','810220564','17146','17146','a1421','b1218','c15288 ','11118','A134597','P','SYSCO FOOD SVS OF ATLANTA','Nov 1 2008 12:00:00:000AM',NULL,'Dec 16 2008 12:00:00:000AM',-8.834999999999999e+001,-8.834999999999999e+001,'A2','YES','INVOICE DATE',0,8,0,0,NULL,4.800000000000000e+001,0,11,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(9,50,'MTGI3','NOR03','5328','286','16725','16725','a1422','b1219','c15289 ','11119','A134598','V','NORTHEAST SALES DIST INC','Oct 15 2008 12:00:00:000AM',NULL,'Nov 30 2008 12:00:00:000AM',5.220000000000000e+002,5.220000000000000e+002,'TD','YES','TRUE DUPLICATE',0,6,0,0,NULL,4.800000000000000e+001,0,1,0,0)
INSERT INTO [mjtest] ([InvoiceId],[ProjectId],[CompanyCode],[VendorNumber],[CheckNumber],[InvoiceNumber],[VoucherNumber],[DocumentNumber],[Ap1],[Ap2],[Ap3],[UserId],[PurchaseOrdNumber],[CheckStatus],[VendorName],[InvoiceDate],[InvoiceNo],[CheckDate],[InvoiceAmount],[CheckAmount],[tagging],[auto_tag],[duplicate_type],[convertion_done],[duplicate_with],[StatusId],[unmark],[reason],[discount],[tagstatus],[sortbyid],[chkfornegative],[id])VALUES(10,50,'MTGI3','SYS03','101218','812231389','17160','17160','a1423','b1220','c15290 ','11120','A134599','P','SYSCO FOOD SVS OF ATLANTA','Dec 23 2008 12:00:00:000AM',NULL,'Dec 30 2008 12:00:00:000AM',-9.509270000000000e+003,-9.509270000000000e+003,'no','no',NULL,0,0,0,0,NULL,4.800000000000000e+001,0,22,0,0)
THis is the subquery method I am using to find duplicates and update them. THere are cases when there more than 1 duplicate records in the table and I want all of them to have the same duplicate_with so I can group them together.
UPDATE mjtest
SET duplicate_with = MyAlias.InvoiceID, duplicate_type = 'TRUE DUPLICATE'
FROM (Select top 100 percent InvoiceID, projectid, VendorNumber, CheckNumber, InvoiceNumber, InvoiceDate,InvoiceAmount,VendorName
from mjtest order by invoiceid desc) MyAlias
where mjtest.InvoiceID <> MyAlias.InvoiceID
AND (mjtest.duplicate_with is null or mjtest.duplicate_with = 0)
AND mjtest.VendorNumber = MyAlias.VendorNumber
AND mjtest.VendorName = MyAlias.VendorName
AND mjtest.InvoiceNumber = MyAlias.InvoiceNumber
AND mjtest.InvoiceDate = MyAlias.InvoiceDate
AND mjtest.InvoiceAmount = MyAlias.InvoiceAmount
AND mjtest.projectid = MyAlias.projectid
Eg: If Invoiceid 1,2 and 4 were duplicates the duplicate_with field for invoiceid 1 gets updated to with value 4 and duplicate_with for 2 and 4 updates to value 1. What I want is for all 3 rows to have the lower invoiceid value for duplicate_with field which is 1 so I can group them together. How can I do that?
January 4, 2010 at 3:15 am
Your insert throws an error I guess(may be because of the identitiy insert)
Anyhow, if I am anywhere close to what you are looking for, see if this helps please.
Select * from (Select VendorNumber,VendorName ,InvoiceNumber ,
InvoiceDate , InvoiceAmount ,projectid, duplicate_with,
Row_number() OVER (Partition by
VendorNumber,VendorName ,InvoiceNumber ,
InvoiceDate , InvoiceAmount ,projectid ORDER BY
invoiceId desc) as row_num
from #mjtest) T1
Where Row_num = 1
This way you can identify the lowest invoiceID amongst duplicates and use it to update your duplicate_with column?
Let us know if this is not clear. Thanks.
---------------------------------------------------------------------------------
January 4, 2010 at 8:10 pm
This looks that it might work. You can change Identity insert to on before inserting records. I am trying to write a update statement. Can you modify this select to update statement?
January 4, 2010 at 10:10 pm
See if this works. I used what Nabha posted. apologize for the mixed case.
UPDATE M1
SET M1.duplicate_with = M2.INVOICEID
FROM MJTEST M1 JOIN
(
Select InvoiceId, VendorNumber,VendorName ,InvoiceNumber ,
InvoiceDate , InvoiceAmount ,projectid, duplicate_with from (Select InvoiceId, VendorNumber,VendorName ,InvoiceNumber ,
InvoiceDate , InvoiceAmount ,projectid, duplicate_with,
Row_number() OVER (Partition by
VendorNumber,VendorName ,InvoiceNumber ,
InvoiceDate , InvoiceAmount ,projectid ORDER BY
invoiceId ASC) as row_num
from mjtest
) T1
Where Row_num = 1
) M2
ON m1.VendorNumber = m2.VendorNumber
AND m1.VendorName = m2.VendorName
AND m1.InvoiceNumber = m2.InvoiceNumber
AND m1.InvoiceDate = m2.InvoiceDate
AND m1.InvoiceAmount = m2.InvoiceAmount
AND m1.projectid = m2.projectid
Viewing 4 posts - 1 through 3 (of 3 total)
You must be logged in to reply to this topic. Login to reply