May 31, 2006 at 2:19 pm
I have a 4 million row table with emails from different sources.
I need to clean it up and only have valid email addresses like
abc@abc.com anything else has to be deleted.
Here is the sample of data in my table.
liaosdnjgabgsdg
;5252@fdsgg
3264272
dsgsadhaf.com
@ouasfj.com
There are more weird entries.
I've already Ltrim and Rtrim the fields.
How can i clean this table ?
THANK YOU
May 31, 2006 at 2:42 pm
Here's a suggestion:
- Get the DBA Toolkit at http://www.sqlservercentral.com/columnists/mcoles/sql2000dbatoolkitpart2.asp
- You can validate email addresses based on a regular expression similar to the following:
SELECT dbo.fn_regex_match ('^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$', 'validemail@sqlservercentral.com', 'I+')
If you're using SQL 2005 you can use SQLCLR to create a regex function instead and CROSS APPLY to execute the function directly on the columns.
With SQL 2000 you'll need to play around with it.
June 1, 2006 at 1:16 am
Hi,
you can use the regular expression with a user function like this:
--select dbo.fnTestStringWithRegularExpression('pippo123', 4, 'T', 'T')
ALTER function [dbo].[fnTestStringWithRegularExpression]
(
@stringaDaControllare varchar(4096), -- stringa da controllare
@tipoStringa smallint, -- 1 = email
-- 2 = numero di telefono italiano
-- 3 = numero di telefono US
-- 4 = stringa di lettere e numeri di almeno 8 caratteri
@ricercaGlobale char(1), -- T = cerca tutte le occorrenze
-- F = cerca solo la prima occorrenza
@ignoreCaseSensitive char(1) -- T = ricerca ignorando case sensitive
-- F = ricerca senza ignorare case sensitive
 
returns char(1) -- T = stringa da controllare ok
-- F = stringa da controllare non corretta
as
begin
-- espressioni di controllo in funzione del tipo di stringa da verificare
declare @espressioneDiControllo varchar(1048)
set @espressioneDiControllo = ''
if (@tipoStringa = 1)
begin
-- set @espressioneDiControllo = '^([a-zA-Z0-9_\-])+(\.([a-zA-Z0-9_\-])+)*@((\[(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5]))\]))|((([a-zA-Z0-9])+(([\-])+([a-zA-Z0-9])+)*\.)+([a-zA-Z])+(([\-])+([a-zA-Z0-9])+)*))$' --'^(([A-Za-z0-9]+_+)|([A-Za-z0-9]+\-+)|([A-Za-z0-9]+\.+)|([A-Za-z0-9]+\++))*[A-Za-z0-9]+@((\w+\-+)|(\w+\.))*\w{1,63}\.[a-zA-Z]{2,6}$'
-- ==============================================================================================================================
set @espressioneDiControllo = '^([a-zA-Z0-9_\-])+(\.([a-zA-Z0-9_\-])+)*@((\w+\-+)|(\w+\.))*\w{1,63}\.[a-zA-Z]{2,6}$'
end
if (@tipoStringa = 2)
begin
-- numero di telefono italiano
set @espressioneDiControllo = '^([0-9]*\-?\ ?\/?[0-9]*)$'
end
if (@tipoStringa = 3)
begin
-- numero di telefono US
set @espressioneDiControllo = '^(??<1>[(])?(?<AreaCode>[2-9]\d{2})(?(1)[)])(?(1)(?<2>[ ])|(??<3>[-])|(?<4>[ ])))?)?(?<Prefix>[1-9]\d{2})(?(AreaCode)(??(1)(?(2)[- ]|[-]?))|(?(3)[-])|(?(4)[- ]))|[- ]?)(?<Suffix>\d{4})$'
end
if (@tipoStringa = 4)
begin
-- stringa di lettere e numeri di almmeno 8 caratteri
set @espressioneDiControllo = '(?!^[0-9]*$)(?!^[a-zA-Z]*$)^([a-zA-Z0-9]{8,20})$'
end
if (@espressioneDiControllo <> '')
begin
-- esegue il controllo
declare @hr integer
declare @objRegExp integer
declare @results bit
declare @output char(1)
-- creazione oggetto com per le regular expression
exec @hr = sp_OACreate 'VBScript.RegExp', @objRegExp output
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
-- imposta le proprietà dell'oggetto com
-- pattern rispetto cui testare
exec @hr = sp_OAsetProperty @objRegExp, 'Pattern', @espressioneDiControllo
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
-- verifica globale o si ferma alla prima occorrenza
if (@ricercaGlobale = 'T')
begin
exec @hr = sp_OAsetProperty @objRegExp, 'Global', True
end
else
begin
exec @hr = sp_OAsetProperty @objRegExp, 'Global', False
end
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
-- ignora le lettere maiuscole
if (@ignoreCaseSensitive = 'T')
begin
exec @hr = sp_OAsetProperty @objRegExp, 'IgnoreCase', True
end
else
begin
exec @hr = sp_OAsetProperty @objRegExp, 'IgnoreCase', False
end
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
-- esegue il controllo della stringa chiamando il metodo Test
exec @hr = sp_OAMethod @objRegExp, 'Test', @results output, @stringaDaControllare
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
-- distrugge l'oggetto com
exec @hr = sp_OADestroy @objRegExp
if (@hr <> 0)
begin
set @results = 0
goto USCITA
end
end
else
begin
-- tipologia non prevista
set @results = 0
goto USCITA
end
USCITA:
if (@results = 1)
begin
set @output = 'T'
end
else
begin
set @output = 'F'
end
return @output
end
June 1, 2006 at 7:26 am
I can't comment on the proposed solutions, but if my email address
is in your database, it's in the format
myname@agency.state.statecode.us
I find it extremely annoying when I try to submit that as my email
address, and am told that it's invalid.
Mattie
June 1, 2006 at 7:32 am
Hi,
i test my user function and i reveived OK:
select dbo.fnTestStringWithRegularExpression('myname@agency.state.statecode.us', 1, 'T', 'T')
tell T, so the email address is ok.
Bye
June 1, 2006 at 8:00 am
Two comments about the sp_OACreate method -
1) VBScript doesn't support many regex features that are standard in Perl (lookbehind, atomic grouping, possessive quantifiers, \A, \Z, etc.)
2) sp_OACreate can cause memory leaks:
http://support.microsoft.com/kb/897572
http://support.microsoft.com/kb/816937/EN-US/
Make sure you have SP 4 installed and that sp_Destroy is getting called every time if you use this method. I've seen it shut down production servers on a daily basis.
June 1, 2006 at 9:20 am
@vc table(vc varchar(50))
@vc
'dhskjdhfkss' union all
'dhsk@fkss' union all
select ';dhsk@fkss.ooo' union all
'dh-sk@fk-ss.ooo' union all
'dh\sk@fk-ss.ooo' union all
'@fkss.das' union all
'dhsk@fkss.a.asd.asd.aaaaaa' union all
'dhsk@fkss.ddd..dddd' union all
'dh+k@fkss.ddd.d' union all
'dhsk@fkss.ddd.kk.dd' union all
'dhsk@f#ss.ddd' union all
'd--k@f_ss.ddd' union all
'------@__-ss.___._._._' union all
'dhsk@f#ss.ddd' union all
'dh_k@fkss.ddd' union all
'dhsk@fkss.' union all
'.dhsk@fkss.nnn' union all
'sgsgaha.sksjkajd.d.d.dhsk@fkss.nnn.dsdsd.as' union all
'd.h.s.k@f.k.s.s.f' union all
'd@f.s' union all
'd.hs@fkss.d' union all
vc.vc VALID____________________________
@vc vc
not vc.vc like '%[^a-Z0-9.@!_-]%' escape '!' --contains only alphanumeric, '-', '.' and '@'
not vc.vc like '%@%@%' --has no more than one '@' sign
not vc.vc like '%..%' --no adjacent dots
not vc.vc like '.%' --no leading dot
and vc.vc like '%[^.]@[^.]%._%' --has an '@' without dots next to it, and to its right a dot followed by another char.
vc.vc INVALID__________________________
@vc vc
vc.vc like '%[^a-Z0-9.@!_-]%' escape '!' --contains only alphanumeric, '-', '.' and '@'
vc.vc like '%@%@%' --has no more than one '@' sign
vc.vc like '%..%' --no adjacent dots)
vc.vc like '.%' --no leading dot
not vc.vc like '%[^.]@[^.]%._%' --has an '@' without dots next to it, and a dot afterwards.
Tim Wilkinson
"If it doesn't work in practice, you're using the wrong theory"
- Immanuel Kant
June 1, 2006 at 8:51 pm
THANK YOU ALL FOR RESPONDING.
Viewing 8 posts - 1 through 7 (of 7 total)
You must be logged in to reply to this topic. Login to reply