Force similar strings to be the same

  • Hello everyone:

    I'm working on a sql script that attempts to unify inconsistent spelling of names to one spelling. For example, "O CONNOR, BOB" vs. "OCONNOR, BOB" are similar but not identical-- therefore I need to create a name that can be applied to both.

    Below is a script I've written to address this, and although it seems like it should meet my needs (based on the test script), it fails to work quickly on a table of 100,000 rows (in fact the script has been running for 24+ hours...)

    Ultimately, I need to end up with a crosswalk table that lists a person's ID, their raw spelling of their names, and a column that is the cleaned-up version of their names stamped on all applicable rows.

    Many thanks in advance for your help!

    --Peter

    /********************************/

    CREATE TABLE DBO.TEST_NAMES(

    NAME_ID INT IDENTITY(1,1),

    NAME_DESC VARCHAR(50),

    NAME_DESC_REVISED VARCHAR(50))

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE F'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE FRANK'

    /* ADD A COLUMN TO ASSIST IN SORTING THE NAMES; REMOVE COLUMN AT END OF SCRIPT */

    ALTER TABLE DBO.TEST_NAMES

    ADD INTERIM_DESC VARCHAR(50)

    GO

    /**/

    UPDATETEST_NAMES

    SET INTERIM_DESC=CASE

    WHEN NAME_DESC_REVISED IS NOT NULL

    AND NAME_DESC_REVISED<>''

    THEN NAME_DESC_REVISED

    ELSE NAME_DESC

    END

    UPDATETEST_NAMES

    SETNAME_DESC_REVISED = NAME_DESC

    WHERENAME_DESC_REVISED IS NULL OR NAME_DESC_REVISED = ''

    PRINT '*** ORIGINAL LIST ***'

    SELECT * FROM DBO.TEST_NAMES

    /**/

    IF (SELECTCOUNT(*)

    FROMTEST_NAMES A

    JOIN

    TEST_NAMES B

    ON A.NAME_DESC_REVISED<> B.NAME_DESC_REVISED

    AND A.NAME_DESC_REVISED LIKE B.NAME_DESC_REVISED+'%')>0

    BEGIN

    IF OBJECT_ID('TEMPDB..#TEMP') IS NOT NULL

    DROP TABLE #TEMP

    SELECTROW_ID=A.NAME_ID,

    A.NAME_ID,

    NAME_DESC=A.INTERIM_DESC,

    BNAME_ID=B.NAME_ID,

    BNAME_DESC=B.INTERIM_DESC

    INTO#TEMP

    FROMTEST_NAMES A

    JOIN

    TEST_NAMES B

    ON A.INTERIM_DESC<> B.INTERIM_DESC

    AND REPLACE(REPLACE(REPLACE(REPLACE(A.INTERIM_DESC,'-',''),' ',''),'''',''),'.','')

    LIKE REPLACE(REPLACE(REPLACE(REPLACE(B.INTERIM_DESC,'-',''),' ',''),'''',''),'.','') +'%'

    PRINT '*** LIST OF SIMILAR NAMES ***'

    SELECT * FROM #TEMP

    IF OBJECT_ID('TEMPDB..#ROLLUP') IS NOT NULL

    DROP TABLE #ROLLUP

    SELECTROW_ID,

    NAME_ID,

    NAME_DESC

    INTO#ROLLUP

    FROM#TEMP

    UNION

    SELECTROW_ID,

    BNAME_ID,

    BNAME_DESC

    FROM#TEMP

    IF OBJECT_ID('TEMPDB..#ROLLUP2') IS NOT NULL

    DROP TABLE #ROLLUP2

    SELECTROW_ID,

    NAME_ID,

    NAME_DESC,

    NAME_DESC_REV =(

    SELECTMAX(B.NAME_DESC)

    FROM#ROLLUP B

    WHEREB.ROW_ID = A.ROW_ID

    )

    INTO #ROLLUP2

    FROM#ROLLUP A

    UPDATETEST_NAMES

    SETINTERIM_DESC = REPLACE(REPLACE(R.NAME_DESC_REV,'''',''),'.','') /* STRIP SINGLE QUOTES AND PERIODS FROM MAX NAMES*/

    FROMTEST_NAMES T

    JOIN

    #ROLLUP2 R

    ON T.NAME_ID = R.NAME_ID

    UPDATETEST_NAMES

    SETNAME_DESC_REVISED = INTERIM_DESC

    WHERENAME_DESC_REVISED<>INTERIM_DESC

    END

    PRINT''

    PRINT '**** FINAL LIST OF NAMES ****'

    ALTER TABLE DBO.TEST_NAMES

    DROP COLUMN INTERIM_DESC

    GO

    SELECT * FROM DBO.TEST_NAMES

    ORDER BY NAME_DESC_REVISED

    /* END OF SCRIPT*/

  • Hi Peterzeke,

    Hope this is what you are looking for.

    CREATE TABLE DBO.TEST_NAMES(

    NAME_ID INT IDENTITY(1,1),

    NAME_DESC VARCHAR(50),

    NAME_DESC_REVISED VARCHAR(50))

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE F'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'BROWN, JOE FRANK'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'Oakley, Glen'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'Oakley, Blen Dark'

    INSERT DBO.TEST_NAMES(NAME_DESC)

    SELECT 'Russel, Barrat'

    /* ADD A COLUMN TO ASSIST IN SORTING THE NAMES; REMOVE COLUMN AT END OF SCRIPT */

    ALTER TABLE DBO.TEST_NAMES

    ADD INTERIM_DESC VARCHAR(50)

    GO

    /* */

    UPDATE TEST_NAMES

    SET INTERIM_DESC= SOUNDEX(Name_Desc)

    DECLARE @Sndx VARCHAR(15), @MaxLen INT, @Name_Desc VARCHAR(100)

    DECLARE CUR CURSOR FOR SELECT DISTINCT INTERIM_DESC FROM TEST_NAMES

    OPEN CUR

    FETCH NEXT FROM CUR INTO @Sndx

    WHILE @@FETCH_STATUS = 0

    BEGIN

    SELECT @MaxLen = MAX(LEN(NAME_DESC)) FROM TEST_NAMES WHERE INTERIM_DESC = @Sndx

    SELECT @Name_Desc = NAME_DESC FROM TEST_NAMES WHERE INTERIM_DESC = @Sndx AND LEN(NAME_DESC) = @MaxLen

    UPDATE TEST_NAMES

    SET NAME_DESC_REVISED = @Name_Desc

    WHERE INTERIM_DESC = @Sndx

    FETCH NEXT FROM CUR INTO @Sndx

    END

    CLOSE CUR

    DEALLOCATE CUR

    Thanks

  • I use an ActiveX component called ActiveGender for this. It only costs $250, and it can deal with a lot of variations like hyphenated names, multi-part names, etc. Like Kathleen O'Hara-De La Croix, or Bob & Mary Smith.

    It's not perfect, you still have to look for special characters to tell it what format to expect (i.e. "first middle last", "last, first middle", "first middle & first middle last", etc). But it does a good job (given the correct format hint) of deciphering single or double names and identifying prefixes and suffixes (sr jr, mr, mrs, dr, MD, etc).

  • Any thought of cleaning the string of Suffix (Mr, Mrs) and prefix (MD, Jr) and just using SOUNDEX?

  • We built our own in VB, to get the best matches we strip out all the punctuation, convert Bob to Robert, Street to St (for addresses), etc, and have several different combinations we match on.

    Andy

    http://www.sqlservercentral.com/columnists/awarren/

Viewing 5 posts - 1 through 4 (of 4 total)

You must be logged in to reply to this topic. Login to reply