November 18, 2011 at 4:13 am
One of the ways to delete duplicate records is on the SQL Authority website. I've used it many times, but when I looked at the performance of the query, it seems to use a lot of CPU and physical IO.
-- Removes duplicates, keeping only the row with the highest ID within each
-- (DuplicateColumn1, DuplicateColumn2, DuplicateColumn3) group.
DELETE
FROM MyTable
WHERE ID NOT IN
(
-- The subquery returns one surviving MAX(ID) per duplicate group;
-- every ID not in that list is deleted.
-- NOTE(review): NOT IN re-evaluates the subquery against every row, and if
-- the subquery could ever return a NULL the DELETE would remove nothing --
-- NOT EXISTS is usually the safer/faster form. Assumes ID is non-nullable;
-- TODO confirm.
SELECT MAX(ID)
FROM MyTable
GROUP BY DuplicateColumn1, DuplicateColumn2, DuplicateColumn3)
Can you suggest any other way of doing this with better performance and fewer resources consumed?
thanks,
Vijay
November 18, 2011 at 4:20 am
you might wanna take a look at this post on duplicates
November 18, 2011 at 4:37 am
dva2007 (11/18/2011)
one of the way to delete duplicate record is on sql authority website. I've used it many times but when i looked at the performance of the query it uses lot of CPU and physical IO it seems.DELETE
FROM MyTable
WHERE ID NOT IN
(
SELECT MAX(ID)
FROM MyTable
GROUP BY DuplicateColumn1, DuplicateColumn2, DuplicateColumn3)
Can you suggest any other way doing this with better performance and less resource taken?
thanks,
Vijay
Test some different methods until you find one you're happy with.
-- Benchmark: two dedupe techniques against identical 1,000,000-row copies.
-- Everything runs inside one transaction and is rolled back at the end.
BEGIN TRAN
--Standard TestEnvironment of 1,000,000 rows of random-ish data
IF object_id('tempdb..#testEnvironment') IS NOT NULL
BEGIN
DROP TABLE #testEnvironment
END
IF object_id('tempdb..#testEnvironment2') IS NOT NULL
BEGIN
DROP TABLE #testEnvironment2
END
--1,000,000 Random rows of data
-- Cascading-CTE tally: t1 = 2 rows, t2 = 4, t3 = 16, t4 = 256, t5 = 65,536;
-- t5 x t5 covers > 4 billion potential rows, capped at 1,000,000 by TOP.
;WITH t1 AS (SELECT 1 N UNION ALL SELECT 1 N),
t2 AS (SELECT 1 N FROM t1 x, t1 y),
t3 AS (SELECT 1 N FROM t2 x, t2 y),
t4 AS (SELECT 1 N FROM t3 x, t3 y),
t5 AS (SELECT 1 N FROM t4 x, t4 y),
tally AS (SELECT TOP 1000000 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS n
FROM t5 x, t5 y)
-- randomSmallInt is 1..100, so each value repeats ~10,000 times: heavy duplication.
SELECT n AS ID,
(ABS(CHECKSUM(NEWID())) % 100) + 1 AS randomSmallInt
INTO #testEnvironment
FROM tally
--Duplicate table to keep test fair
SELECT ID, randomSmallInt
INTO #testEnvironment2
FROM #testEnvironment
PRINT '========== Add Indexes =========='
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment (ID)
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment2 (ID)
PRINT REPLICATE('=',80)
PRINT '========== Delete=========='
SET STATISTICS IO ON
-- Version 1: the original poster's NOT IN / MAX(ID) approach
-- (keeps the highest ID per randomSmallInt group).
DELETE
FROM #testEnvironment
WHERE ID NOT IN (SELECT MAX(ID)
FROM #testEnvironment
GROUP BY randomSmallInt)
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
PRINT '========== Delete Version 2=========='
SET STATISTICS IO ON
-- Version 2: ROW_NUMBER() in an updatable CTE; deleting through the CTE
-- deletes from the underlying table, keeping the lowest ID per group.
;WITH duplicates AS (
SELECT ROW_NUMBER() OVER (PARTITION BY randomSmallInt ORDER BY ID) AS rn,
ID, randomSmallInt
FROM #testEnvironment2)
DELETE FROM duplicates
WHERE rn > 1
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
-- This script is a benchmark, not a data change: discard everything.
ROLLBACK
(1000000 row(s) affected)
(1000000 row(s) affected)
========== Add Indexes ==========
================================================================================
========== Delete==========
Table 'Worktable'. Scan count 4, logical reads 2403280, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table '#testEnvironment____________________________________________________________________________________________________000000000071'. Scan count 18, logical reads 4111411, physical reads 0, read-ahead reads 12, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(999900 row(s) affected)
================================================================================
========== Delete Version 2==========
Table '#testEnvironment2___________________________________________________________________________________________________000000000072'. Scan count 5, logical reads 4088033, physical reads 0, read-ahead reads 12, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(999900 row(s) affected)
================================================================================
November 18, 2011 at 9:03 am
Try out this....
-- Deletes every row whose ID value occurs more than once in TABLE1.
-- FIX: the original selected COUNT(ID) (a row count) in the IN-list instead
-- of the duplicated ID values themselves, so it only deleted rows whose ID
-- happened to coincide with some group's count -- in practice, nothing
-- (the later benchmark in this thread measured 0 rows affected).
-- NOTE(review): this removes ALL copies of a duplicated ID, not "all but
-- one"; use a ROW_NUMBER() delete if one survivor must be kept.
DELETE FROM TABLE1
WHERE ID IN (SELECT ID
             FROM TABLE1
             GROUP BY ID
             HAVING COUNT(ID) > 1)
I have one doubt: do you want to delete all the records that are duplicates, or do you want to delete all duplicate records except one?
Thanks & Regards
Syed Sami Ur Rehman
SQL-Server (Developer)
Hyderabad
Email-sami.sqldba@gmail.com
November 21, 2011 at 1:00 am
-- Demo: number the rows inside each (Date_Stamp, KW1, KW2) duplicate group.
CREATE TABLE T
(
    Date_Stamp DATETIME,
    KW1        DECIMAL(5,1),
    KW2        DECIMAL(5,1)
)

-- Sample data with two duplicate groups: 12/10 appears twice, 12/7 three times.
INSERT INTO T (Date_Stamp, KW1, KW2)
SELECT '12/10/2010', 5.3, 3.1 UNION ALL
SELECT '12/10/2010', 5.3, 3.1 UNION ALL
SELECT '12/9/2010', 4, 2 UNION ALL
SELECT '12/8/2010', 3, 1 UNION ALL
SELECT '12/7/2010', 7.4, 5 UNION ALL
SELECT '12/7/2010', 7.4, 5 UNION ALL
SELECT '12/7/2010', 7.4, 5

-- rn restarts at 1 for every distinct (Date_Stamp, KW1, KW2) combination,
-- so rn > 1 identifies the redundant copies.
;WITH cte AS
(
    SELECT ROW_NUMBER() OVER (PARTITION BY Date_Stamp, KW1, KW2
                              ORDER BY Date_Stamp) AS rn,
           KW1,
           KW2,
           Date_Stamp
    FROM T
)
SELECT * FROM cte
Result):
rn KW1 KW2 Date_Stamp
1 7.4 5.0 2010-12-07 00:00:00.000
2 7.4 5.0 2010-12-07 00:00:00.000
3 7.4 5.0 2010-12-07 00:00:00.000
1 3.0 1.0 2010-12-08 00:00:00.000
1 4.0 2.0 2010-12-09 00:00:00.000
1 5.3 3.1 2010-12-10 00:00:00.000
2 5.3 3.1 2010-12-10 00:00:00.000
to delete duplicates change "SELECT * FROM cte" to DELETE FROM cte WHERE rn > 1
More about CTE on the link below.
http://msdn.microsoft.com/en-us/library/ms190766.aspx
kudos to bitbucket-25253 for the code.
cheers 😀
===============================================================
"lets do amazing" our company motto..
November 21, 2011 at 2:32 am
Hi jnuqui
Good Work Dude.
Thanks & Regards
Syed Sami Ur Rehman
SQL-Server (Developer)
Hyderabad
Email-sami.sqldba@gmail.com
November 21, 2011 at 3:04 pm
In terms of which technique will perform the best; it really depends on the actual table definition and indexing. Obviously what you originally intended to be an ID column is not a primary key or else you wouldn't have duplicates now. However, rather than using MAX(ID), I'd suggest ROW_NUMBER(), so you have more control over which rows get deleted.
-- Demo: rank the copies of each id, newest insertdate first.
DECLARE @mytable TABLE
(
    id         INT      NOT NULL,
    insertdate DATETIME NOT NULL
);

-- id 1 appears three times; ids 2 and 3 are unique.
INSERT @mytable (id, insertdate) VALUES (1, '2011-11-01');
INSERT @mytable (id, insertdate) VALUES (1, '2011-11-02');
INSERT @mytable (id, insertdate) VALUES (1, '2011-11-02');
INSERT @mytable (id, insertdate) VALUES (2, '2011-11-01');
INSERT @mytable (id, insertdate) VALUES (3, '2011-11-01');

-- dup_id = 1 marks the row with the latest insertdate in each id group;
-- anything with dup_id > 1 is a candidate for deletion.
SELECT *,
       ROW_NUMBER() OVER (PARTITION BY id ORDER BY insertdate DESC) AS dup_id
FROM @mytable;
id insertdate dup_id
--- ---------- -------
1 2011-11-02 1
1 2011-11-02 2
1 2011-11-01 3
2 2011-11-01 1
3 2011-11-01 1
If this is a staging table containing non-cleansed data, then consider making id + dup_id your primary key here, don't delete anything from it, and insert rows where dup_id = 1 to your clean transactional table.
"Do not seek to follow in the footsteps of the wise. Instead, seek what they sought." - Matsuo Basho
November 21, 2011 at 7:36 pm
Greetings,
It depends on 2 things:
1. Do you want to delete all the rows or just leave one of the duplicates in there?
2. If you just want to delete one, you have to come up with some logic to select the right rows to delete. What logic would you like to use? For example, you may want to delete the row that was entered in last. In coming up with this logic, you want to see if the rows are complete duplicates of each other (all the columns have the same data) or if there are some columns (such as identity columns) which are different. If they are complete duplicates then you can move them all to a separate table and then do a SELECT DISTINCT on all the columns to insert just a single row back to the original table. If they are not complete duplicates then you can maybe use the columns with differences to come up with the logic for the rows that you want to delete.
When you answer both of those questions I can give you a more specific suggestion.
Thanks,
SB
November 25, 2011 at 5:49 am
Hi,
You can try below solution
-- Deletes every duplicate, keeping the row with the smallest uniqueField
-- within each dupField group.
-- FIX: rewritten from an implicit comma join (easy to drop a condition,
-- flagged as an anti-pattern) to an explicit INNER JOIN; behavior is
-- identical -- a T1 row is deleted whenever any T2 row shares its dupField
-- with a smaller uniqueField.
DELETE T1
FROM MyTable AS T1
INNER JOIN MyTable AS T2
    ON  T2.dupField = T1.dupField
    AND T2.uniqueField < T1.uniqueField
November 25, 2011 at 7:22 am
Since this post seems to not want to die, here's an update of the test script I posted before to include the unique solutions posted since.
-- Benchmark rerun: four dedupe techniques against identical 1,000,000-row
-- copies; STATISTICS IO is reported per version and everything is rolled back.
BEGIN TRAN
--1,000,000 Random rows of data
-- Cascading-CTE tally: t5 = 65,536 rows, so t5 x t5 covers > 4 billion
-- potential rows, capped at 1,000,000 by TOP.
;WITH t1 AS (SELECT 1 N UNION ALL SELECT 1 N),
t2 AS (SELECT 1 N FROM t1 x, t1 y),
t3 AS (SELECT 1 N FROM t2 x, t2 y),
t4 AS (SELECT 1 N FROM t3 x, t3 y),
t5 AS (SELECT 1 N FROM t4 x, t4 y),
tally AS (SELECT TOP 1000000 ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS n
FROM t5 x, t5 y)
-- randomSmallInt is 1..100, so each value repeats ~10,000 times.
SELECT n AS ID,
(ABS(CHECKSUM(NEWID())) % 100) + 1 AS randomSmallInt
INTO #testEnvironment
FROM tally
--Duplicate tables to keep test fair
SELECT ID, randomSmallInt
INTO #testEnvironment2
FROM #testEnvironment
SELECT ID, randomSmallInt
INTO #testEnvironment3
FROM #testEnvironment
SELECT ID, randomSmallInt
INTO #testEnvironment4
FROM #testEnvironment
PRINT '========== Add Indexes =========='
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment (ID)
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment2 (ID)
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment3 (ID)
CREATE NONCLUSTERED INDEX [test_index]
ON #testEnvironment4 (ID)
PRINT REPLICATE('=',80)
PRINT '========== Delete Version 1=========='
SET STATISTICS IO ON
-- Version 1: NOT IN / MAX(ID) -- the original problem code
-- (keeps the highest ID per randomSmallInt group).
DELETE
FROM #testEnvironment
WHERE ID NOT IN (SELECT MAX(ID)
FROM #testEnvironment
GROUP BY randomSmallInt)
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
PRINT '========== Delete Version 2=========='
SET STATISTICS IO ON
-- Version 2: ROW_NUMBER() updatable-CTE delete
-- (keeps the lowest ID per randomSmallInt group).
;WITH duplicates AS (
SELECT ROW_NUMBER() OVER (PARTITION BY randomSmallInt ORDER BY ID) AS rn,
ID, randomSmallInt
FROM #testEnvironment2)
DELETE FROM duplicates
WHERE rn > 1
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
PRINT '========== Delete Version 3=========='
SET STATISTICS IO ON
-- Version 3 (as posted): compares ID values against COUNT(ID). Because ID is
-- unique here, every GROUP BY ID count is 1, HAVING > 1 matches nothing, and
-- the DELETE removes 0 rows -- see the results below.
DELETE FROM #testEnvironment3
WHERE ID IN(SELECT COUNT(ID) AS ID
FROM #testEnvironment3
GROUP BY ID
HAVING (COUNT(ID)>1))
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
PRINT '========== Delete Version 4=========='
SET STATISTICS IO ON
-- Version 4: comma-join self-join delete
-- (keeps the lowest ID per randomSmallInt group).
DELETE T1
FROM #testEnvironment4 T1, #testEnvironment4 T2
WHERE T1.randomSmallInt = T2.randomSmallInt
AND T1.ID > T2.ID
SET STATISTICS IO OFF
PRINT REPLICATE('=',80)
-- Benchmark only: discard all changes.
ROLLBACK
Records Deleted by each version -
Version 1: 999900
Version 2: 999900
Version 3: 0
Version 4: 999900
Note that Version 3 as described does not work.
(1000000 row(s) affected)
Table '#testEnvironment____________________________________________________________________________________________________000000000016'. Scan count 1, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(1000000 row(s) affected)
Table '#testEnvironment____________________________________________________________________________________________________000000000016'. Scan count 1, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(1000000 row(s) affected)
Table '#testEnvironment____________________________________________________________________________________________________000000000016'. Scan count 1, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(1000000 row(s) affected)
========== Add Indexes ==========
Table '#testEnvironment____________________________________________________________________________________________________000000000016'. Scan count 5, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table '#testEnvironment2___________________________________________________________________________________________________000000000017'. Scan count 5, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table '#testEnvironment3___________________________________________________________________________________________________000000000018'. Scan count 5, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table '#testEnvironment4___________________________________________________________________________________________________000000000019'. Scan count 5, logical reads 2599, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
================================================================================
========== Delete Version 1==========
Table 'Worktable'. Scan count 4, logical reads 2403280, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table '#testEnvironment____________________________________________________________________________________________________000000000016'. Scan count 18, logical reads 4111411, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(999900 row(s) affected)
================================================================================
========== Delete Version 2==========
Table '#testEnvironment2___________________________________________________________________________________________________000000000017'. Scan count 5, logical reads 4088033, physical reads 0, read-ahead reads 4, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(999900 row(s) affected)
================================================================================
========== Delete Version 3==========
Table '#testEnvironment3___________________________________________________________________________________________________000000000018'. Scan count 1, logical reads 2734, physical reads 0, read-ahead reads 4, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 0, logical reads 0, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(0 row(s) affected)
================================================================================
========== Delete Version 4==========
Table '#testEnvironment4___________________________________________________________________________________________________000000000019'. Scan count 10, logical reads 4090632, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
Table 'Worktable'. Scan count 4, logical reads 2612485, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.
(999900 row(s) affected)
================================================================================
November 25, 2011 at 6:13 pm
you can use common table expressions as well
some thing like
-- Keeps one row per (col1, col2) group and deletes the rest via the CTE.
-- FIX: the original had a stray comma between the PARTITION BY list and
-- ORDER BY inside OVER() (a syntax error), and selected "col, col2" while
-- partitioning by "col1, col2" -- the column names are now consistent.
;WITH temptable AS
(
    SELECT col1,
           col2,
           ROW_NUMBER() OVER (PARTITION BY col1, col2
                              ORDER BY col1, col2) AS row_num
    FROM yourtable
)
DELETE FROM temptable
WHERE row_num > 1
November 27, 2011 at 12:03 am
-- Same approach as the thread's original problem code: keep MAX(column_id)
-- per column_name group, delete everything else.
-- FIX: corrected the typo'd table name ("talbe" -> MyTable, matching the
-- placeholder used elsewhere in this thread).
-- NOTE(review): filters on "id" but the subquery returns "column_id" --
-- these must be the same column for this to work. TODO confirm.
DELETE FROM MyTable
WHERE id NOT IN (SELECT MAX(column_id)
                 FROM MyTable
                 GROUP BY column_name)
November 27, 2011 at 4:23 am
Try this for better performance
-- Keeps one arbitrary row per ID value and deletes the remaining copies.
-- NOTE(review): partitions by ID, so this only works when ID is the
-- duplicated column rather than a unique key -- TODO confirm against the
-- real table.
WITH C AS
(
-- ORDER BY (SELECT 0) satisfies the mandatory ORDER BY without imposing a
-- real ordering, so WHICH duplicate survives is arbitrary.
SELECT *, ROW_NUMBER() OVER(PARTITION BY ID ORDER BY (SELECT 0)) AS rn
FROM MyTable
)
-- Deleting through the CTE deletes the matching rows from MyTable itself.
DELETE FROM C
WHERE rn>1
November 27, 2011 at 8:42 am
Cadavre (11/25/2011)
Since this post seems to not want to die, here's an update of the test script I posted before to include the unique solutions posted since.
Heh... apparently reading previous posts and suggestions, never mind testing for functionality (some of the code wipes out all dupes without leaving 1 instance behind) and performance, isn't a part of what most people do. 😉
Thanks for doing the testing, Cadavre.
--Jeff Moden
Change is inevitable... Change for the better is not.
November 27, 2011 at 8:45 am
dastagiri16 (11/27/2011)
delete from talbe where id not in (select max(column_id) from talbe group by column_name)
Except for the missing columns, that's identical to the original problem-code. 😉
--Jeff Moden
Change is inevitable... Change for the better is not.
Viewing 15 posts - 1 through 15 (of 15 total)
You must be logged in to reply to this topic. Login to reply