February 16, 2010 at 9:15 am
Hi People,
I wonder if someone might offer some advice / assistance with the following problem I have.
I have a very large dataset that contains records that need to be merged, and the original records removed.
For example, out of, say, 100 records, there could be 40 "groups" of duplicates. Some groups contain 3 duplicates, others 2, and some records are not duplicated at all.
In some cases I would like to keep the value that was last updated, in others the first value (ie the lowest ID) needs to be kept. These need to create a new record for the "Master" of each group.
create table #People(
id int,
First nvarchar(50),
Last nvarchar (50),
Email nvarchar (50),
Likes_Fish bit,
Likes_Meat bit,
Drinks_Alcohol bit,
Gender nvarchar(1),
Account_Number nvarchar (50),
Account_Expiry datetime,
Group_num int,
Master_id int,
Record_modified datetime)
Insert into #People (id, first, last, email, likes_fish, likes_meat,drinks_alcohol,
gender, account_number, account_expiry, group_num,master_id,record_modified)
select 1,'Shris','Barnard','1.chrisbarnard@email.com',0,1,1,'M',4127,10/11/2010,1,1,16/08/2009
union all select 2,'Chris','Barnard','1.chrisbarnard@email.com',0,1,1,'M',4127,10/11/2010,1,1,16/08/2007
union all select 3,'Kim','Mi Kyoung','1004-1123@email.com',0,1,1,'F','',01/01/1900,2,3,30/03/2009
union all select 4,'.','Mi Kyoung','1004-1123@email.com',0,0,1,'F',7390,27/06/2009,2,3,17/09/2007
union all select 5,'Joan','Tan','123@email.com',0,0,0,'F','',01/01/1900,3,5,16/08/2009
union all select 6,'Joan','Tan','123@email.com',1,0,0,'F',6004,13/11/2010,3,5,14/06/2009
union all select 7,'Kim','Croucher','12345@email.com',1,1,0,'F',545,27/01/2009,4,7,18/01/2009
union all select 8,'Kim','Croucher','12345@email.com',1,1,0,'F',576,13/11/2010,4,7,19/06/2009
union all select 9,'Stephen','ONeill','166shanid@email.com',0,0,0,'M','',01/01/1900,5,9,19/06/2009
union all select 10,'Mary','ONeill','166shanid@email.com',0,0,0,'F',1930,24/01/2009,5,9,04/03/2009
union all select 11,'Mary','Murray','60436@email.com',0,1,0,'F','',01/01/1900,6,11,08/07/2008
union all select 12,'Mary','Murray','60436@email.com',0,1,0,'F','',01/01/1900,6,11,17/05/2008
union all select 13,'William','Wong','9am6pm@email.com',1,1,0,'M',2756,07/06/2009,7,13,07/11/2007
union all select 14,'W','Wong','9am6pm@email.com',1,1,0,'M','',01/01/1900,7,13,08/01/2010
union all select 15,'Margaret','Buick','a.buick@email.com',1,0,1,'F',2673,30/06/2009,8,15,20/11/2009
union all select 16,'Margaret','Buick','a.buick@email.com',0,0,0,'F',6750,07/06/2009,8,15,16/11/2009
union all select 17,'Anna','Corbett','a.corbett@email.com',0,1,0,'F','',01/01/1900,9,17,01/09/2009
union all select 18,'Anna','Corbett','a.corbett@email.com',0,1,0,'F',9069,24/04/2010,9,17,26/07/2009
union all select 19,'Abby','Allan','a.crammer@email.com',1,1,0,'F','',01/01/1900,10,19,22/02/2009
union all select 20,'Abby','Allen','a.crammer@email.com',1,1,0,'F','',01/01/1900,10,19,16/02/2009
union all select 21,'Nicola','Thompson','a.haworth@email.com',1,0,1,'F','',01/01/1900,11,21,28/01/2009
union all select 22,'Andrew','Haworth','a.haworth@email.com',0,1,1,'M',9979,26/12/2009,11,21,02/12/2008
union all select 23,'Amanda','Tinker','a.j.tinker@email.com',0,0,0,'M','',01/01/1900,12,23,07/11/2008
union all select 24,'Amanda','Tinker','a.j.tinker@email.com',0,0,1,'F',9046,11/11/2009,12,23,22/10/2008
union all select 25,'Nicholas','Hall','a.j.tinker@email.com',0,0,0,'F',643,25/04/2009,12,23,12/10/2008
union all select 26,'Ambika','Kirupakumaran','a.kirupakumaran@email.com',0,0,0,'F','',01/01/1900,13,26,29/09/2008
union all select 27,'A','Kirupakumaran','a.kirupakumaran@email.com',0,0,0,'F','',01/01/1900,13,26,07/06/2007
union all select 28,'Ambika','Kirupakumaran','a.kirupakumaran@email.com',0,0,0,'F','',15/04/2009,13,26,07/06/2007
union all select 29,'Alex','Milton','a.milton@email.com',0,0,0,'M',1123,01/01/1900,14,28,09/11/2009
union all select 30,'Alexander','Milton','a.milton@email.com',0,0,0,'M','',01/01/1900,14,28,07/06/2007
union all select 31,'Cheryl','Davis','a.rudge@email.com',0,0,0,'F',3229,27/11/2009,15,31,28/12/2008
union all select 32,'Anthony','Rudge','a.rudge@email.com',0,0,1,'M','',01/01/1900,15,31,20/07/2009
union all select 33,'Abbas','Sharif','abbas.sharif@email.com',0,0,1,'M',3073,07/06/2009,16,33,23/08/2009
union all select 34,'Abbas','Sharif','abbas.sharif@email.com',0,0,1,'M','',01/01/1900,16,33,13/08/2008
union all select 35,'Anne','Barrett','abexm@email.com',0,0,0,'F','',01/01/1900,17,35,12/08/2008
union all select 36,'Anne','Barrett','abexm@email.com',0,0,1,'F','',01/01/1900,17,35,26/08/2009
union all select 37,'Anne','Barrett','abexm@email.com',0,0,1,'F','',01/01/1900,17,35,07/06/2007
union all select 38,'Leslie','Abram','abram@email.com',0,0,1,'F','',01/01/1900,18,38,07/06/2007
union all select 39,'Leslie','Abram','abram@email.com',0,0,0,'F','',01/01/1900,18,38,31/12/2008
union all select 40,'Carol','Walker','ac.ca@email.com',0,0,1,'F','',01/01/1900,19,40,30/11/2007
union all select 41,'Alan','Walker','ac.ca@email.com',0,0,0,'M','',01/01/1900,19,40,31/10/2009
union all select 42,'Donna','Mannion','accounts@email.com',0,0,0,'F','',01/01/1900,20,42,17/11/2007
union all select 43,'Donna','Mannion','accounts@email.com',0,0,1,'F',9845,07/06/2009,20,42,21/04/2009
union all select 44,'Donn','Mannion','accounts@email.com',0,0,0,'F',405,07/06/2009,20,42,07/01/2009
union all select 45,'Drew','Munnoch','acmx21@email.com',0,0,1,'M',3425,04/04/2009,21,45,12/04/2008
union all select 46,'Jackie','Munnoch','acmx21@email.com',0,0,1,'F','',01/01/1900,21,45,07/06/2007
union all select 47,'Astrid','Cox','acox05@email.com',0,0,1,'F',9972,11/01/2010,22,47,30/01/2009
union all select 48,'Astrid','Cox','acox05@email.com',0,0,1,'F','',01/01/1900,22,47,07/06/2007
union all select 49,'Adam','Sivil','adam.sivil@email.com',0,0,1,'M',1356,07/06/2009,23,49,03/05/2009
union all select 50,'Adam','Sivil','adam.sivil@email.com',0,0,1,'M','',01/01/1900,23,49,07/06/2007
union all select 51,'Adam','Kennaugh','adam@email.com',0,0,1,'M','',01/01/1900,24,51,28/03/2009
union all select 52,'Adam C E','Kennaugh','adam@email.com',0,0,0,'m',7698,02/01/2010,24,51,29/11/2009
union all select 53,'Adamos','Georgiou','adamos.georgiou@email.com',0,0,1,'U','',01/01/1900,25,53,15/07/2009
union all select 54,'.','Georgiou','adamos.georgiou@email.com',0,0,1,'m',9906,07/06/2009,25,53,13/01/2010
union all select 55,'Adele','Hulbert','adele1@email.com',0,0,1,'F','',01/01/1900,26,55,07/06/2007
union all select 56,'Adele','Hulbert','adele1@email.com',0,0,0,'F','',01/01/1900,26,55,07/06/2007
union all select 57,'Eco','Burke','admin@email.com',0,0,1,'F',4137,11/09/2009,27,57,13/01/2010
union all select 58,'Rachel','Burke','admin@email.com',0,0,1,'F','',01/01/1900,27,57,09/05/2008
union all select 59,'Karen','LEnfant','brian.lenfant@email.com',0,0,1,'F',8889,20/08/2009,28,59,07/06/2007
union all select 60,'brian','LEnfant','brian.lenfant@email.com',0,0,1,'M','',01/01/1900,28,59,02/04/2009
union all select 61,'brian','Valve','brian.Valve@email.com',0,0,1,'M',8020,02/07/2009,29,61,11/01/2008
union all select 62,'brian','Valve','brian.Valve@email.com',0,0,1,'M','',01/10/2009,29,61,15/02/2009
union all select 63,'Anne','Carlile','aecarlile@email.com',0,0,0,'F','',01/01/1900,30,63,06/10/2008
union all select 64,'Anne','Carlile','aecarlile@email.com',0,0,1,'F',7261,08/01/2010,30,63,06/10/2008
union all select 65,'Ann','Sim','aesimuk@email.com',0,0,1,'F','',01/01/1900,31,65,14/10/2009
union all select 66,'Ann','Sim','aesimuk@email.com',0,0,1,'F',6758,11/09/2009,31,65,07/06/2007
union all select 67,'Roy','Sim','aesimuk@email.com',0,0,0,'M',781,17/12/2010,31,65,10/12/2008
union all select 68,'Anne','Gray','ag@email.com',0,0,0,'F',1804,06/12/2010,32,68,24/11/2007
union all select 69,'Anne','Gray','ag@email.com',0,0,1,'F','',01/01/1900,32,68,27/11/2008
union all select 70,'Agata','Stachura','agata@email.com',0,0,0,'F','',01/01/1900,33,70,07/06/2007
union all select 71,'Agata','Stachura','agata@email.com',0,0,1,'F','',01/01/1900,33,70,02/12/2008
union all select 72,'Agata','Prazmowska','agatrice@email.com',0,0,1,'F','',01/01/1900,34,72,20/06/2009
union all select 73,'Agata','Prazmowska','agatrice@email.com',0,0,1,'F',7319,04/01/2009,34,72,09/07/2008
union all select 74,'Alison','Gray','agray@email.com',0,0,1,'F',7948,24/03/2009,35,74,08/10/2009
union all select 75,'Alison','Gray','agray@email.com',0,0,1,'F','',01/01/1900,35,74,07/06/2007
union all select 76,'Amanda','Collings','ahcollings-crafter@email.com',0,0,1,'F',5783,29/07/2010,36,76,07/06/2007
union all select 77,'Amanda Helen','Collings','ahcollings-crafter@email.com',0,0,1,'F',9408,21/06/2010,36,76,09/11/2009
union all select 78,'Aideen','Mullen','aidoshack@email.com',0,0,1,'F','',01/01/1900,37,78,09/03/2009
union all select 79,'Aideen','Mullen','aidoshack@email.com',0,0,0,'F','',01/01/1900,37,78,03/12/2009
union all select 80,'Ailsa','Billington','ailsa@email.com',0,0,1,'F',1802,11/04/2009,38,80,26/06/2009
union all select 81,'Ailsa','Billington','ailsa@email.com',0,0,1,'F','',01/01/1900,38,80,01/04/2009
union all select 82,'Ailsa','Dumble','ailsadumble@email.com',0,0,1,'F','',01/01/1900,39,82,22/11/2008
union all select 83,'Ailsa','Dumble','ailsadumble@email.com',0,0,1,'F',6853,04/07/2010,39,82,19/06/2008
union all select 84,'Ailsa','Kapadia','ailsakapadia@email.com',0,0,0,'F',4917,06/05/2009,40,84,09/05/2008
union all select 85,'Ailsa','Kapadia','ailsakapadia@email.com',0,0,1,'F','',01/01/1900,40,84,14/04/2008
union all select 86,'','Sosaar','airi@email.com',0,0,0,'F','',01/01/1900,41,86,07/06/2007
union all select 87,'Airi','Sosaar','airi@email.com',0,0,1,'F',6813,07/06/2009,41,86,24/03/2009
union all select 88,'White','Aisling','aisling@email.com',0,0,0,'F','',01/01/1900,42,88,30/12/2009
union all select 89,'Aisling','White','aisling@email.com',0,0,0,'F',499,07/06/2009,42,88,06/10/2009
union all select 90,'Graham','Campbell','aislingandgraham@email.com',0,0,0,'M','',01/01/1900,43,90,14/02/2009
union all select 91,'Graham','Campell','aislingandgraham@email.com',0,0,0,'M','',01/01/1900,43,90,23/09/2009
union all select 92,'Aisling','Cullen','aislingc@email.com',0,0,1,'F',1100,15/11/2010,44,92,11/01/2008
union all select 93,'Aisling','Cullen','aislingc@email.com',0,0,0,'F','',01/01/1900,44,92,07/06/2007
union all select 94,'Aisling','Cullen','aislingc78@email.com',0,0,1,'F','',01/01/1900,45,94,16/10/2009
union all select 95,'A','Cullin','aislingc78@email.com',0,0,1,'F',3194,31/05/2009,45,94,09/09/2008
union all select 96,'Aisling','O Brien','aislingob@email.com',0,0,1,'F','',01/01/1900,46,96,21/08/2007
union all select 97,'Aisling','Obrien','aislingob@email.com',0,0,1,'F',6628,02/10/2010,46,96,19/11/2009
union all select 98,'Alison','Davies','ajd@email.com',0,0,1,'F','',01/01/1900,47,98,27/08/2009
union all select 99,'Alison','Davies','ajd@email.com',0,0,1,'F',419,07/06/2009,47,98,09/03/2009
union all select 100,'Amanda','Marshall','ajmgsm@email.com',0,0,1,'F',4247,24/01/2009,48,100,21/08/2007
I would like one record per "Master_id" with the following:
1) The First, and Last that are both present from the lowest of the IDs.
2) Keep the "1" for Likes fish, meat and alcohol. ie if there is a "1" for any of the group of records, I want to keep it.
3) Keep the Gender if the is an M or F. Discard and "U"s
4) Keep the Account Number and Expiry for the record with the latest expiry date
I will post what I have tried so far shortly!
Thanks,
Paul
February 16, 2010 at 10:39 am
I'd be looking at some data entry to fix the dataset.
February 16, 2010 at 10:42 am
Of course, if you have many thousands of records, you'd need to define some more detailed rules.
March 16, 2010 at 7:17 am
Hi again,
Just to get back to this issue. Yes, going forward there are going to be massive changes to the data entry.. with a complete application redesign. However I do need to clean up the existing data. I have now finalised a detailed set of rules, but to summarise, I would like to know how to pull out particular column values per "group" of duplicates.
For example, how can I get the First_Name from the record from each group with the highest ID, unless this is null, in which case the next highest id(and so on)?
How can I select a field or fields by being from the record with the highest id?
And how can I keep Ts in the T or F field. (I think I have this one.. select max(likes_fish) from Table1 group by email)
I wish to build a single master record for each group of duplicates, and have just the three rule types:
1) Field value from highest Id in group, unless NULL, then next highest
2) Series of field values from highest Id in group
3) Max of the T / F fields.
Can someone give me a pointer please? I am dealing with in excess of a million records.
Paul
March 16, 2010 at 8:19 am
The insert scripts need single-quotes around the dates, otherwise it treats them as math problems. "08/01/2010" = 8 divided by 1 divided by 2010 = 0 (because it's integer math by default). Which results in all the dates becoming 1 Jan 1900, because that's 0 in datetime storage.
- Gus "GSquared", RSVP, OODA, MAP, NMVP, FAQ, SAT, SQL, DNA, RNA, UOI, IOU, AM, PM, AD, BC, BCE, USA, UN, CF, ROFL, LOL, ETC
Property of The Thread
"Nobody knows the age of the human race, but everyone agrees it's old enough to know better." - Anon
March 16, 2010 at 8:35 am
Here's a query that seems to do what you need. Try it, let me know if it works.
Performance on it will be a problem, but I can't fine tune that. You'll need to look at various method for getting this kind of data from the table and work out which variation will be best for your data. Without access to the actual table, I can't do that.
;
WITH CTE
AS (SELECT master_id,
MIN(ID) AS first_id,
MAX(Account_Expiry) AS latest_expiry
FROM #People
GROUP BY master_id)
SELECT P1.master_id,
P1.[First],
P1.[Last],
CASE WHEN EXISTS ( SELECT 1
FROM #People P3
WHERE master_id = P1.master_id
AND Likes_fish = 1 ) THEN 1
ELSE 0
END AS Likes_Fish,
CASE WHEN EXISTS ( SELECT 1
FROM #People P3
WHERE master_id = P1.master_id
AND Likes_meat = 1 ) THEN 1
ELSE 0
END AS Likes_Meat,
CASE WHEN EXISTS ( SELECT 1
FROM #People P3
WHERE master_id = P1.master_id
AND Drinks_Alcohol = 1 ) THEN 1
ELSE 0
END AS Drinks_Alcohol,
(SELECT TOP 1
Gender
FROM #People P6
WHERE Master_id = P1.Master_id
AND Gender IN (N'M', N'F')) AS Gender,
CTE.latest_expiry,
(SELECT TOP 1
Account_Number
FROM #People P7
WHERE master_id = P1.Master_id
AND Account_Expiry = CTE.latest_expiry) AS Account_Number
FROM #People P1
INNER JOIN CTE
ON P1.master_id = CTE.master_id
AND P1.ID = CTE.first_id;
- Gus "GSquared", RSVP, OODA, MAP, NMVP, FAQ, SAT, SQL, DNA, RNA, UOI, IOU, AM, PM, AD, BC, BCE, USA, UN, CF, ROFL, LOL, ETC
Property of The Thread
"Nobody knows the age of the human race, but everyone agrees it's old enough to know better." - Anon
March 16, 2010 at 8:35 am
I am sorry about that
March 16, 2010 at 8:41 am
Thank you G-Squared.
Woah! That is far more complicated than my SQL knowledge allows. I will need to do a crash course in how this works. I fI read up on "with CTE" will this help me to understand the script?
What I can confirm is that this provided me with the records that I want!!
March 17, 2010 at 6:25 am
Look up "Common Table Expression" in Books Online or on MSDN.
- Gus "GSquared", RSVP, OODA, MAP, NMVP, FAQ, SAT, SQL, DNA, RNA, UOI, IOU, AM, PM, AD, BC, BCE, USA, UN, CF, ROFL, LOL, ETC
Property of The Thread
"Nobody knows the age of the human race, but everyone agrees it's old enough to know better." - Anon
March 17, 2010 at 6:27 am
I will do that.
Thanks for your valuable assistance.
March 17, 2010 at 6:50 am
You're welcome.
- Gus "GSquared", RSVP, OODA, MAP, NMVP, FAQ, SAT, SQL, DNA, RNA, UOI, IOU, AM, PM, AD, BC, BCE, USA, UN, CF, ROFL, LOL, ETC
Property of The Thread
"Nobody knows the age of the human race, but everyone agrees it's old enough to know better." - Anon
Viewing 11 posts - 1 through 10 (of 10 total)
You must be logged in to reply to this topic. Login to reply