March 20, 2008 at 3:22 pm
Hello ,
I'm trying to create a procedure that will remove duplicates from the customers_dim_test table.
This is the table structure
CREATE TABLE [dbo].[Customers_Dim_Test](
[Customer Key] [varchar](50) NULL,
[CustomerID] [int] NULL,
[firstname] [varchar](50) NULL,
[lastname] [varchar](50) NULL,
[varchar](50) NULL,
[address] [varchar](max) NULL,
[city] [varchar](50) NULL,
[state] [varchar](50) NULL,
[country] [varchar](50) NULL,
[zip] [varchar](50) NULL,
[socialsecurity] [varchar](50) NULL,
[optIn] [bit] NULL,
[placeToContact] [varchar](50) NULL,
[timeToContact] [varchar](50) NULL,
[unsubscribed] [char](10) NULL,
[bounced] [char](10) NULL,
[tcAccepted] [bit] NULL,
[tcAcceptedDate] [datetime] NULL,
[tcAcceptedIP] [varchar](50) NULL,
[lastLogin] [datetime] NULL,
[passwordsetup] [bit] NULL,
[datranSent] [bit] NULL,
[advaliantSent] [bit] NULL,
[coverClicksent] [bit] NULL,
[customerReferenceID] [int] NULL,
[referenceFirstName] [varchar](50) NULL,
[referenceLastName] [varchar](50) NULL,
[referencePhone] [varchar](50) NULL,
[referenceRelationship] [varchar](50) NULL,
[customerPhoneID] [int] NULL,
[phoneNumber] [varchar](50) NULL,
[phoneType] [varchar](50) NULL,
[customerPersonalID] [int] NULL,
[mothersMaiden] [varchar](50) NULL,
[birthYear] [smallint] NULL,
[birthMonth] [tinyint] NULL,
[birthDay] [tinyint] NULL,
[gender] [varchar](50) NULL,
[issuedIDState] [varchar](50) NULL,
[maritalStatus] [varchar](50) NULL,
[UScitizen] [bit] NULL,
[activeMilitary] [bit] NULL,
[customerHousingID] [int] NULL,
[ownHome] [bit] NULL,
[yearsAtResidence] [tinyint] NULL,
[monthsAtResidence] [tinyint] NULL,
[monthlypayment] [smallmoney] NULL,
[customerFinancialID] [int] NULL,
[foreclosure] [bit] NULL,
[bankruptcyEver] [bit] NULL,
[bankruptcyNow] [bit] NULL,
[nonSufficient2Months] [nchar](10) NULL,
[nonSufficient6Months] [bit] NULL,
[netMonthlyIncome] [money] NULL,
[incomeSource] [varchar](50) NULL,
[net1000] [bit] NULL,
[cosignerAvailable] [bit] NULL,
[customerEmploymentID] [int] NULL,
[occupation] [varchar](50) NULL,
[selfEmployed] [bit] NULL,
[timeWithEmployerYears] [tinyint] NULL,
[timeWithEmployerMonths] [tinyint] NULL,
[overtimeOrBonus] [bit] NULL,
[directDepositCurrently] [bit] NULL,
[directDepositOffered] [bit] NULL,
[payPeriod] [varchar](50) NULL,
[nextPayday] [smalldatetime] NULL,
[secondPayday] [smalldatetime] NULL,
[thirdPayday] [smalldatetime] NULL,
[fourthPayday] [smalldatetime] NULL,
[wagesGarnished] [bit] NULL,
[workHours] [varchar](50) NULL,
[workStatus] [char](10) NULL,
[emphistory3mnths] [bit] NULL,
[workExt] [varchar](50) NULL,
[middleinitial] [varchar](50) NULL,
[username] [varchar](50) NULL,
[password] [varchar](50) NULL,
[issuedIDNumber] [varchar](50) NULL
) ON [PRIMARY]
The primary key here is the customerid.
A duplicate can be
1]Two records having the same firstname, lastname, email
2]Having the same firstname, lastname, address
so on till a all duplicates on the basis of firstname, lastname, email, socialsecurity, address are removed.
The procedure is as as follows -------------------------------------
CREATE PROCEDURE delete_cust_duplicates
AS
BEGIN
SET NOCOUNT ON;
create table #tempduplicatedata
(
customerID integer,
firstname varchar(50),
lastname varchar(50),
Address varchar(50),
socialsecurity varchar(50),
email varchar(50)
)
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, email
from customers_Dim_test cCREATE PROCEDURE delete_cust_duplicates
AS
BEGIN
SET NOCOUNT ON;
create table #tempduplicatedata
(
customerID integer,
firstname varchar(50),
lastname varchar(50),
Address varchar(50),
socialsecurity varchar(50),
email varchar(50)
)
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, email
from customers_Dim_test c
group by firstname,lastname,email
having count(*) > 1
-----------deleteing duplicates that have the same firstname, lastname, emailaddress
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.email = tpdata.email
insert into customers_dim_test
select firstname, lastname, email from #tempduplicatedata
------ clearing contents from the temporary table
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, lastname, address
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, address
from customers_Dim_test c
group by firstname,lastname,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.address = tpdata.address
------ clearing contents from the temporary table
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, lastname, email
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, socialsecurity
from customers_Dim_test c
group by firstname,lastname,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, email,socialsecurity
Insert into #tempduplicatedata
select max(customerid),firstname, email, socialsecurity
from customers_Dim_test c
group by firstname,email,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.email = tpdata.email
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of lastname, email,socialsecurity
Insert into #tempduplicatedata
select max(customerid),lastname, email, socialsecurity
from customers_Dim_test c
group by lastname,email,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.lastname = tpdata.lastname
and c.email = tpdata.email
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, email,address
Insert into #tempduplicatedata
select max(customerid),firstname, email, address
from customers_Dim_test c
group by lastname,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
-------------deleteing duplicates on basis of lastname, email,address
Insert into #tempduplicatedata
select max(customerid),lastname, email, address
from customers_Dim_test ci
group by lastname,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.lastname = tpdata.lasttname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
-------------deleteing duplicates on basis of lastname, email,address
Insert into #tempduplicatedata
select max(customerid),socialsecurity, email, address
from customers_Dim_test ci
group by socialsecurity,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.socialsecurity = tpdata.lasttname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
END
GO
group by firstname,lastname,email
having count(*) > 1
-----------deleteing duplicates that have the same firstname, lastname, emailaddress
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.email = tpdata.email
insert into customers_dim_test
select firstname, lastname, email from #tempduplicatedata
------ clearing contents from the temporary table
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, lastname, address
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, address
from customers_Dim_test c
group by firstname,lastname,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.address = tpdata.address
------ clearing contents from the temporary table
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, lastname, email
Insert into #tempduplicatedata
select max(customerid),firstname, lastname, socialsecurity
from customers_Dim_test c
group by firstname,lastname,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.lastname = tpdata.lastname
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, email,socialsecurity
Insert into #tempduplicatedata
select max(customerid),firstname, email, socialsecurity
from customers_Dim_test c
group by firstname,email,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.email = tpdata.email
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of lastname, email,socialsecurity
Insert into #tempduplicatedata
select max(customerid),lastname, email, socialsecurity
from customers_Dim_test c
group by lastname,email,socialsecurity
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.lastname = tpdata.lastname
and c.email = tpdata.email
and c.socialsecurity = tpdata.socialsecurity
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedata
-------------deleteing duplicates on basis of firstname, email,address
Insert into #tempduplicatedata
select max(customerid),firstname, email, address
from customers_Dim_test c
group by lastname,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.firstname = tpdata.firstname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
-------------deleteing duplicates on basis of lastname, email,address
Insert into #tempduplicatedata
select max(customerid),lastname, email, address
from customers_Dim_test ci
group by lastname,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.lastname = tpdata.lasttname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
-------------deleteing duplicates on basis of lastname, email,address
Insert into #tempduplicatedata
select max(customerid),socialsecurity, email, address
from customers_Dim_test ci
group by socialsecurity,email,address
having count(*) > 1
delete from customers_dim_test
from customers_dim_test c
inner join #tempduplicatedata as tpdata
on c.socialsecurity = tpdata.lasttname
and c.email = tpdata.email
and c.address = tpdata.address
insert into customers_dim_test
select * from #tempduplicatedata
TRUNCATE TABLE #tempduplicatedat
END
GO
I attempt to delete by 1)first creating a temporary table (say B)
2)Inserting the duplicates into the temporary table
3) Deleting all from table A
4) Inserting back into A from B
However, the problem is when i execute the stored procedure i get the error
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 26
Invalid column name 'firstname'.
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 27
Invalid column name 'lastname'.
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 28
Invalid column name 'email'.
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 31
Invalid column name 'firstname'.
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 31
Invalid column name 'lastname'.
Msg 207, Level 16, State 1, Procedure delete_cust_duplicates, Line 31
Invalid column name 'email'.
Msg 213, Level 16, State 1, Procedure delete_cust_duplicates, Line 31
Insert Error: Column name or number of supplied values does not match table definition.
Msg 156, Level 15, State 1, Line 1
Incorrect syntax near the keyword 'group'.
Msg 102, Level 15, State 1, Line 147
Incorrect syntax near 'END'.
Could someone please let me know what the issue is...Maybe a problem with the logic or syntax, i am not sure.
IF i need to explain again, do let me know
Thankyou
Jude
March 20, 2008 at 3:26 pm
First this is a lot of code to go through. I'd suggest you break it down and get one part working.
The insert into the temp table needs to be separate from the create procedure statement. Creating a stored procedure must be its own batch, separate from other procedures or select statements.
So
...
Go
create procedure delete_cust_duplicates
...
go
create procedure xx
...
go
What you want to do is find the row that you want to keep for all duplicates. If it's the max (customerID), then you have those IDs.
Now, delete the rows that match that row with the same firstname, lastname, email, but customerID < that id.
March 20, 2008 at 3:50 pm
Thanks for the reply,
You're right it wouldn't be easy to diagnose with all the code at once.
However, i have executed the create table and insert table statements separately and they do work.
I did not get exactly what you were suggesting with having different stored procedures.
Do you mean i just create one stored procedure to create the temporary table and one stored procedure to insert data??
Regards
Jude
March 20, 2008 at 4:23 pm
--You don't need temp table and multiple inserts to remove dublicates. Consider this scenario:
--A tbl with some dublicates:
CREATE TABLE [dbo].del_dublicates(
[RowID] [int] NULL,
[CustID] [smallint] NULL,
[CustName] [nchar](10) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[AcctMngr] [nvarchar](50) COLLATE SQL_Latin1_General_CP1_CI_AS NULL,
[UpdateDate] [datetime] NULL
) ON [PRIMARY]
insert del_dublicates values (1, '01', 'Malivaki', 'Bill Z', Null)
insert del_dublicates values (2, '02', 'Bender', 'Partizan', Null)
insert del_dublicates values (3, '03', 'Febder', 'Luongo', Null)
insert del_dublicates values (4, '04', 'Kuriaga', 'Lebeder', Null)
insert del_dublicates values (5, '05', 'Navuxor', 'Vuorkaj', Null)
insert del_dublicates values (6, '06', 'Sergei', 'Barry Trotz', Null)
insert del_dublicates values (12, '01', 'Malivaki', 'Bill Z', Null)
insert del_dublicates values (22, '02', 'Bender', 'Partizan', Null)
insert del_dublicates values (33, '03', 'Febder', 'Luongo', Null)
insert del_dublicates values (44, '04', 'Kuriaga', 'Lebeder', Null)
insert del_dublicates values (56, '05', 'Navuxor', 'Vuorkaj', Null)
insert del_dublicates values (68, '06', 'Sergei', 'Barry Trotz', Null)
select * from del_dublicates
--suppose you want to get rid of dublicates when the yhave identical 'CustID' and 'CustName': there are 6 such pairs.
--Note all of them have unique RowID
--[as is in your case, b/c u said ther's unique key field in your table].
--running this will allow u to get rid of dublicates by this criterion by leaving recs only with only MAX(RowID):
DELETE FROM del_dublicates
--criterion: identical 'CustID' and 'Custname':
Where RowID IN
(
SELECT RowID FROM del_dublicates Tbl1 Left Join
(SELECT CustId, CustName, MAX(RowID) AS RowIDMax
FROM del_dublicates
GROUP BY CustId, CustName ) Tbl2
ON Tbl1.RowID=Tbl2.RowIDMax
WHERE Tbl2.RowIDMax IS NULL
)
result: only 6 recs are left
RowID CustID CustName AcctMngr UpdateDate
----------- ------ ---------- -------------------------------------------------- -----------------------
12 1 Malivaki Bill Z NULL
22 2 Bender Partizan NULL
33 3 Febder Luongo NULL
44 4 Kuriaga Lebeder NULL
56 5 Navuxor Vuorkaj NULL
68 6 Sergei Barry Trotz NULL
(6 row(s) affected)
March 20, 2008 at 5:14 pm
Thanks, it looks very interesting . I apologize but i did not get completely understand the last line (in bold)
SELECT RowID FROM del_dublicates Tbl1 Left Join
(SELECT CustId, CustName, MAX(RowID) AS RowIDMax
FROM del_dublicates
GROUP BY CustId, CustName ) Tbl2
ON Tbl1.RowID=Tbl2.RowIDMax
WHERE Tbl2.RowIDMax IS NULL
I mean , tbl2 has the max(rowID) grouped by the custID, custname. How would it then have null values for the rowID?(I assume it works since you have demonstrated, but i think i cannot clearly see the logic)
Do let me know..!!
Jude
March 20, 2008 at 7:28 pm
This is the original query, reformatted for easier reading with the
dirived table.
SELECT
RowID
FROM
dbo.del_duplicates Tbl1
Left Join (SELECT
CustId,
CustName,
MAX(RowID) AS RowIDMax
FROM
dbo.del_duplicates
GROUP BY
CustId,
CustName
) Tbl2
ON
Tbl1.RowID = Tbl2.RowIDMax
WHERE
Tbl2.RowIDMax IS NULL
This select statement will return one row for each customer with its largest RowID number.
SELECT
CustId,
CustName,
MAX(RowID)
FROM
dbo.del_duplicates
GROUP BY
CustId,
CustName
Turning this select statement into a SQL Server 2005 CTE, allows you to then use this result set
as a table in the immediately following select query. This query, using a left outer join joins
original table with the "virtual table" created by the CTE on the RowID's. As there are more records
in the original table and "mny" that do not have a matching record by RowID in the CTE result set, those
records will have null values for CustId, CustName, and RowIDMax from that result set. The final WHERE
clause filters out all the rows where RowIDMax is not null.
;with UniqueRows (
CustId,
CustName,
RowIDMax
) as (
SELECT
CustId,
CustName,
MAX(RowID)
FROM
dbo.del_duplicates
GROUP BY
CustId,
CustName
)
SELECT
RowID
FROM
dbo.del_duplicates dup
left outer join UniqueRows uni
on (dup.RowID = uni.RowIDMax)
WHERE
uni.RowIDMax is null
I hope this helps.
😎
March 21, 2008 at 11:41 am
I see now, i wasn't focusing on the left join, by definition the join returns all null values from the CTE table. All you have to do then is do a join and delete all rows which do not have the null. Nice! I see i have some ways to go :).
I guess i would follow this way. The stored procedure i created earlier, is taking too long to run. I created it to run on 40 million records, however it has run for a good 16 hrs and yet not completed. Do you or anyone think i should forcibly terminate it?
I'm not sure if it has run into a sort of unending loop...
Should i put some breakpoints in the procedure, or insert some looping conditions?
Thanks!
March 21, 2008 at 11:47 am
b4 facing 40 M recs, I'd probably create a small (100k recs) test dataset, run the sp vs it and gather some performance stats. Then proceed further.
U should terminate the sp that runs for 16 h. It should not take that long even for 40 M recs.
March 24, 2008 at 4:26 pm
I rewrote the procedure for removing the duplicate data as suggested above.
Last week, the datawarehouse went into recovery mode when i tried to restart the server.(Reason being the procedure that ran for 16 hrs , was not getting killed, so i had the server restarted.
I then tested the the procedure for removing the duplicates on about 10000 rows of data. It worked fine
DELETE FROM customers_dim_test
Where customerid IN
(
SELECT customerid FROM customers_dim_test ct1 Left Join
(SELECT firstname,lastname, email, MAX(customerid) AS RowIDMax
FROM customers_dim_test
GROUP BY firstname,lastname, email
[Wink] customers_dim_test2
ON ct1.customerid=customers_dim_test2.RowIDMax
WHERE customers_dim_test2.RowIDMax IS NULL
)
I check for more duplicates in the customers_dim_test table using group by on
1]firstname, lastname, email
2]firstname, lastname, socialsecurity
3]firstname, email , socialsecurity
4]firstname, lastname, address
and so on checking for duplicates in firstname, lastname, email, address, ssn
I am running the query now and it has taken 1hr45 mins, do you think i should terminate it, in order to prevent a situation like that happened lastweek.
Thanks!
March 24, 2008 at 4:44 pm
2 hrs is still too long time; u probably consumed all the memory in your box by now.
I think u r still taking too big leaps: i'd try to run the SP vs a dataset in your production dbase [or a copy], say 1M recs, see what happens. I
t's also possible that data is not clean enough: that becomes an issue sometimes.
One way or another you should be able to debug it to the point when u'll know what the issue you are facing.
March 26, 2008 at 10:02 am
thanx sergei and lynnn, that was something new i learnt, if finally did its job and took about 3 hrs to execute!!
thanx again 🙂
Viewing 11 posts - 1 through 10 (of 10 total)
You must be logged in to reply to this topic. Login to reply