Replacing misspellings in a string

  • Hello:

    I'm working on standardizing address fields in a large contact database.

    I have a list of common misspellings that occur in the StreetName field and

    I'd like to replace them with the proper spelling. The problem is that these

    misspellings are part of a string, so I have to make sure that I'm only replacing

    occurrences in which the misspelling comprises an entire word.

    For example, I could just run:

    UPDATE Address

    SET StreetName = REPLACE(StreetName,'Roda','Road')

    WHERE StreetName LIKE '%Roda%'

    ...but if a StreetName is "Rodanthe Drive", this update would change it to

    "Roadnthe Drive". I found a function to convert to proper case that separates

    out words in a string, so I'd like to use that functionality in conjunction with a

    series of replace functions. Here's the proper case function (created by David

    Wiseman at http://www.wisesoft.co.uk/scripts/tsql_proper_case_udf.aspx):

    CREATE FUNCTION [dbo].[fProperCase](@Value varchar(8000),

    @Exceptions varchar(8000),@UCASEWordLength tinyint)

    returns varchar(8000)

    as

    begin

    declare @sep char(1) -- Separator character for exceptions

    declare @i int -- counter

    declare @ProperCaseText varchar(5000) -- Used to build Proper Case string for Function return

    declare @Word varchar(1000) -- Temporary storage for each word

    declare @IsWhiteSpace as bit -- Used to indicate whitespace character/start of new word

    declare @C char(1) -- Temp storage location for each character

    set @Word = ''

    set @i = 1

    set @IsWhiteSpace = 1

    set @ProperCaseText = ''

    set @sep = '|'

    -- Set default UPPERCASEWord Length

    if @UCASEWordLength is null set @UCASEWordLength = 1

    -- Convert user input to lower case (This function will UPPERCASE words as required)

    set @Value = LOWER(@Value)

    -- Loop while counter is less than text lenth (for each character in...)

    while (@i <= len(@Value)+1)
    begin

    -- Get the current character
    set @C = SUBSTRING(@Value,@i,1)

    -- If start of new word, UPPERCASE character

    if @IsWhiteSpace = 1 set @C = UPPER(@c)

    -- Check if character is white space/symbol (using ascii values)

    set @IsWhiteSpace = case when (ASCII(@c) between 48 and 58) then 0

    when (ASCII(@c) between 64 and 90) then 0

    when (ASCII(@c) between 96 and 123) then 0

    else 1 end

    if @IsWhiteSpace = 0

    begin

    -- Append character to temp @Word variable if not whitespace

    set @Word = @Word + @C

    end

    else

    begin

    -- Character is white space/punctuation/symbol which marks the end of current word.

    -- If word length is less than or equal to UPPERCASE word length, convert to upper case.

    -- e.g. can specify a @UCASEWordLength of 3 to automatically UPPERCASE all 3 letter words.

    set @Word = case when len(@Word) <= @UCASEWordLength then UPPER(@Word) else @Word end -- Check word against user exceptions list. If exception is found, use case specified in exception.
    -- e.g. WiseSoft, RAM, CPU.
    -- If word isn't in user exceptions list, check for "known" exceptions.
    set @Word = case when charindex(@sep + @Word + @sep,@exceptions collate Latin1_General_CI_AS) > 0

    then substring(@exceptions,charindex(@sep + @Word + @sep,@exceptions collate Latin1_General_CI_AS)+1,len(@Word))

    when @Word = 's' and substring(@Value,@i-2,1) = '''' then 's' -- e.g. Who's

    when @Word = 't' and substring(@Value,@i-2,1) = '''' then 't' -- e.g. Don't

    when @Word = 'm' and substring(@Value,@i-2,1) = '''' then 'm' -- e.g. I'm

    when @Word = 'll' and substring(@Value,@i-3,1) = '''' then 'll' -- e.g. He'll

    when @Word = 've' and substring(@Value,@i-3,1) = '''' then 've' -- e.g. Could've

    else @Word end

    -- Append the word to the @ProperCaseText along with the whitespace character

    set @ProperCaseText = @ProperCaseText + @Word + @C

    -- Reset the Temp @Word variable, ready for a new word

    set @Word = ''

    end

    -- Increment the counter

    set @i = @i + 1

    end

    return @ProperCaseText

    end

    I tried to use it like this:

    begin

    declare @Value varchar(5000)

    declare @i int -- counter

    declare @Text varchar(5000)

    declare @Word varchar(1000)

    declare @IsWhiteSpace as bit

    declare @C char(1)

    set @Word = ''

    set @i = 1

    set @IsWhiteSpace = 1

    set @Text = ''

    while (@i <= len(@Value)+1)
    begin

    set @C = SUBSTRING(@Value,@i,1)

    set @IsWhiteSpace = case when (ASCII(@c) between 48 and 58) then 0

    when (ASCII(@c) between 64 and 90) then 0

    when (ASCII(@c) between 96 and 123) then 0

    else 1 end

    if @IsWhiteSpace = 0

    begin

    set @Word = @Word + @C

    end

    else

    begin

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Avenue')

    where @Word IN ('avbe','ave','avemue','aven','avene','aveneu','avenie','avenueue','avenuje','avenune',

    'avenur','avenuse','avenut','aves','aveue','aveune','aveunue','avneue','avrenue','avvenue','svenue')

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Street')

    where @Word IN ('sreet','srteet','steeet','sterrt','streeet','streen','streer','strees','streete','streetm',

    'streetq','streeyt','strett','strreet')

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Extension')

    where @Word IN ('extenstion','extention')

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Road')

    where @Word IN ('raod','roade','roadn','roadx')

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Boulevard')

    where @Word IN ('blouvard','blv','blve','bouevard','boulavard','bouleard','boulevarad','boulvard','bvld')

    UPDATE Address SET StreetName = REPLACE(StreetName,@Word,'Drive')

    where @Word IN ('deive','dirve','dribe','drie','driove','drivde','drivem','driveq','drivev','drivw','drove',

    'druve','drvie','drvive','frive','srive')

    set @Text = @Text + @Word + @C

    set @Word = ''

    end

    set @i = @i + 1

    end

    end

    When I run my version, it says "Command completed successfully," but nothing actually gets replaced.

    I'm certain that there are syntactical errors in my code--I don't claim to know what I'm doing!--

    but since I'm not getting any errors, I don't know what to change. Please please please help!!

    Thanks!

  • Could you add space on either side of the string you are replacing? That would eliminate most false catches.

    UPDATE Address

    SET StreetName = REPLACE(StreetName,' Roda ',' Road ')

    WHERE StreetName LIKE '% Roda %'

    [font="Arial Narrow"]bc[/font]

  • Hi, bc:

    Thanks for the reply! I was considering that as a last-ditch effort, but here's the problem. We send these addresses out to the post office periodically to be formatted and they split the address fields, so "123 West Pine Avenue" would become:

    Field1: 123

    Field2: W

    Field3: PINE

    Field4: AVE

    When the street type (Avenue, Street, Drive, Boulevard, etc.) is misspelled, the post office's database doesn't catch it, so it doesn't separate the fields correctly. So if we have "123 West Pine Aveneu", it'll come back like this:

    Field1: 123

    Field2: W

    Field3: PINE AVENEU

    Field4: NULL

    So in order to catch the misspellings, I would have to search for '% Aveneu %', 'Aveneu %', and '% Aveneu'. I was hoping to avoid having to include all three scenarios for each possible misspelling (which is where the whitespace code from the proper case function I cited would come in), but if I can't figure out an alternative, I guess that's what I'll have to do!

    Thanks!

  • Just a last desperate attempt to get some feedback on this issue before I start slogging through all the possible spacing scenarios... :crying:

  • i have already done something very similar...this code may help out, or just complicate matters...

    it's a huge stack of updates to a temp table, so I could review the old addr vs new "Cleaned up" address to determine whether i should update or not.

    view the attachment, it's just too big to paste, but here's a couple of examples for reference:

    update AddressTest set DistilledAddress=replace(DistilledAddress,' blvd.',' Boulevard')

    update AddressTest set DistilledAddress=replace(DistilledAddress,' blvd',' Boulevard')

    update AddressTest set DistilledAddress=replace(DistilledAddress,'bl.',' Boulevard')

    --lots of misspelling patterns here

    update AddressTest set DistilledAddress=replace(DistilledAddress,'pobox',' P.O. Box')

    update AddressTest set DistilledAddress=replace(DistilledAddress,'po box',' P.O. Box')

    update AddressTest set DistilledAddress=replace(DistilledAddress,'p.o.box',' P.O. Box')

    Lowell


    --help us help you! If you post a question, make sure you include a CREATE TABLE... statement and INSERT INTO... statement into that table to give the volunteers here representative data. with your description of the problem, we can provide a tested, verifiable solution to your question! asking the question the right way gets you a tested answer the fastest way possible!

  • Hi, Lowell:

    Thank you--this will save me a lot of typing! I also enjoyed your use of the word "doink" in your comments. 😛 This is very much appreciated!

  • might speed up a little to parameterize and then toggle between the SELECTs for review and then UPDATE:

    DECLARE@old VARCHAR(20)

    , @new VARCHAR(20)

    SET@old = 'Roda'

    SET@new = 'Road'

    UPDATEAddressSETStreetName = REPLACE(StreetName,' '+@old+' ',' '+@new+' ')

    --SELECT *

    FROMAddress

    WHEREStreetName LIKE '% '+@old+' %' --space either side

    UPDATEAddressSETStreetName = REPLACE(StreetName,' '+@old,' '+@new)

    --SELECT *

    FROMAddress

    WHEREStreetName LIKE '% '+@old --ends with

    UPDATEAddressSETStreetName = REPLACE(StreetName,@old+' ',@new+' ')

    --SELECT *

    FROMAddress

    WHEREStreetName LIKE @old+' %' --starts with

    [font="Arial Narrow"]bc[/font]

  • lk (6/30/2009)


    Hi, Lowell:

    Thank you--this will save me a lot of typing! I also enjoyed your use of the word "doink" in your comments. 😛 This is very much appreciated!

    your welcome!

    it sucks to reinvent the wheel, that's for sure.

    let us know how it works for youy, and any tweaks you add; someone else is eventually going to have the same requirement.

    Lowell


    --help us help you! If you post a question, make sure you include a CREATE TABLE... statement and INSERT INTO... statement into that table to give the volunteers here representative data. with your description of the problem, we can provide a tested, verifiable solution to your question! asking the question the right way gets you a tested answer the fastest way possible!

  • Holy mother, I think I might have gotten it. I changed David Wiseman's proper case function to this:

    CREATE FUNCTION [dbo].[fMisspelling](@Value varchar(255), @Exceptions

    varchar(255))

    returns varchar(5000)

    as

    begin

    declare @sep char(1)

    declare @i int

    declare @Text varchar(5000)

    declare @Word varchar(1000)

    declare @IsWhiteSpace as bit

    declare @C char(1)

    set @Word = ''

    set @i = 1

    set @IsWhiteSpace = 1

    set @Text = ''

    set @sep = '|'

    -- Loop while counter is less than text lenth (for each character in...)

    while (@i 0

    then substring(@exceptions,charindex(@sep + @Word +

    @sep,@exceptions collate Latin1_General_CI_AS)+1,len(@Word))

    when @Word = ('and') then '&'

    when @Word = ('ctr') then 'Center'

    when @Word IN

    ('avbe','ave','avemue','aven','avene','aveneu','avenie','avenueue','avenuje','avenune',

    'avenur','avenuse','avenut','aves','aveue','aveune','aveunue','avneue','avrenue','avvenue',

    'svenue') then 'Avenue'

    when @Word IN ('sreet','srteet','steeet','sterrt','streeet','streen','streer','strees','streete','streetm','streetq',

    'streeyt','strett','strreet') then 'Street'

    when @Word IN ('extenstion','extention') then 'Extension'

    when @Word IN ('raod','roade','roadn','roadx') then 'Road'

    when @Word IN

    ('blouvard','blv','blve','bouevard','boulavard','bouleard','boulevarad','boulvard','bvld') then

    'Boulevard'

    when @Word = ('apartmenet') then 'Apartment'

    when @Word IN

    ('deive','dirve','dribe','drie','driove','drivde','drivem','driveq','drivev','drivw','drove','druve',

    'drvie','drvive','frive','srive') then 'Drive'

    when @Word = ('bx') then 'Box'

    when @Word IN ('ccourt','courtf','cout') then 'Court'

    when @Word IN ('biulding','bldg') then 'Building'

    when @Word IN ('cicle','circel','ciricle','cricle') then 'Circle'

    when @Word = ('trale') then 'Trail'

    else @Word end

    -- Append the word to the @Text along with the whitespace character

    set @Text = @Text + @Word + @C

    -- Reset the Temp @Word variable, ready for a new word

    set @Word = ''

    end

    -- Increment the counter

    set @i = @i + 1

    end

    return @Text

    end

    I ran UPDATE Address SET StreetName = dbo.fMisspelling(StreetName,null)

    ...and it looks like it worked! I have to run some tests to make sure it didn't screw anything up, but if it works the way I think it did, I'm buying myself a cookie!:-D

Viewing 9 posts - 1 through 8 (of 8 total)

You must be logged in to reply to this topic. Login to reply