DECLARE @String VARCHAR(MAX)
SELECT @String = 'When you are starting out to validate text input, perhaps to clean up a database, you will, at some point need to get down to the character level. Doing a simple character frequency of a string is pretty easy. all you need to do is to do is this '
SELECT [frequency] = COUNT(*),
[character] = '''' + CONVERT(CHAR(1), SUBSTRING(@String, number, 1)) + ''''
FROM numbers
WHERE number < LEN(@string + '|')
GROUP BY SUBSTRING(@String, number, 1)
ORDER BY COUNT(*) DESC
/*
frequency character
----------- ---------
23 ' '
8 'e'
7 't'
7 's'
7 'i'
7 'o'
6 'a'
... ...
... ...
*/
--what about the ratio of vowels to consonants? We need to exclude white-space for this and we'll include Y as a vowel.
SELECT [vowel ratio] = CONVERT(NUMERIC(16, 2),
SUM(CASE
WHEN SUBSTRING(@String, number, 1)
IN ('a', 'e', 'i', 'o', 'u', 'y')
THEN 1
ELSE 0
END) * 100.00 / COUNT(*))
FROM numbers
WHERE number < LEN(@string + '|')
AND SUBSTRING(@String, number, 1) LIKE ('[A-Za-z]')
/* OK we can now test this with the entire text of Moby Dick. This will give us a reasonable figure for what one might expect from English*/
DECLARE @LotsOfText VARCHAR(MAX)
SELECT @LotsOfText = BulkColumn
-- SQL Server 2005/8 only for this code)
FROM OPENROWSET(BULK 'C:\workbench\moby-dick.txt', SINGLE_BLOB) AS x
SELECT [vowel ratio] = CONVERT(NUMERIC(16, 2),
SUM(CASE
WHEN SUBSTRING(@LotsOfText, number, 1)
IN ('a', 'e', 'i', 'o', 'u', 'y')
THEN 1
ELSE 0
END) * 100.00 / COUNT(*))
FROM numbers
WHERE number < LEN(@LotsOfText + '|')
AND SUBSTRING(@LotsOfText, number, 1) LIKE ('[A-Za-z]')
--vowel ratio
-------------
--39.20
/* Ok, with this tool, we have something that we can use to flag up text entry which is anomalous. What about people's names, though. This could be a lot trickier. After all. Dwight Ng is a valid name. That has a ratio of 12.5%*/
DROP TABLE PeoplesNames
CREATE TABLE PeoplesNames
(
firstname VARCHAR(50) NOT NULL,
lastname VARCHAR(50) NOT NULL,
[vowels] INT,
[characters] INT,
[Ratio] INT
)
--we'll import a million names in here just to test things out.
/*Now we add the vowel count, character count and then calculate the ratios*/
UPDATE PeoplesNames
SET vowels = (SELECT COUNT(*)
FROM numbers
WHERE number < LEN(Firstname + ' ' + lastname + '|')
AND SUBSTRING(Firstname + ' ' + lastname, number, 1)
IN ('a', 'e', 'i', 'o', 'u', 'y')
),
Characters = (SELECT COUNT(*)
FROM numbers
WHERE number < LEN(Firstname + ' ' + lastname + '|')
AND SUBSTRING(Firstname + ' ' + lastname, number, 1)
LIKE ('[A-Za-z]')
)
UPDATE PeoplesNames
SET ratio = vowels * 100.00 / CHARacters
/*
Dwight Schwartz--14%
Dwight Mc Knight--14%
Aimee Yu--85%
So now we can suspect any name less than 10% or more than 90% vowel. However, if you have stored the vowel ratios, then you can set your criteria for inspection as sensitive as you need without making the lights dim every time you do a query.
But this is a bit crude....*/
DECLARE @String VARCHAR(MAX)
SELECT @String = 'Another approach is to look at the frequency range of the two-letter combinations used in your text. You will find that there are only a limited number that are used in English'
SELECT [frequency] = COUNT(*),
[biChar] = '''' + CONVERT(CHAR(2), SUBSTRING(@String, number, 2)) + ''''
FROM numbers
WHERE number < LEN(@string + '|')
AND SUBSTRING(@String, number, 2) LIKE '[A-Za-z][A-Za-z]'
GROUP BY SUBSTRING(@String, number, 2)
ORDER BY COUNT(*) DESC
/* OK we can now test this with the entire text of Moby Dick. This will give us a reasonable frequency analysis for English*/
DECLARE @LotsOfText VARCHAR(MAX)
SELECT @LotsOfText = BulkColumn
-- SQL Server 2005/8 only for this code)
FROM OPENROWSET(BULK 'C:\workbench\moby-dick.txt', SINGLE_BLOB) AS x
SELECT [frequency] = COUNT(*),
[biChar] = '''' + CONVERT(CHAR(2), SUBSTRING(@LotsOfText, number, 2)) + ''''
FROM numbers
WHERE number < LEN(@LotsOfText + '|')
AND SUBSTRING(@LotsOfText, number, 2) LIKE '[A-Za-z][A-Za-z]'
GROUP BY SUBSTRING(@LotsOfText, number, 2)
ORDER BY COUNT(*) DESC
--490
/*if you run this, you'll see that only around a third of the possible two-letter combinations are used. Here it gets fun. You can now score your strings to see how closely they conform to the standard frequency for two-letter combinations. You could just create an index that deducts a point every time an unusual two-character combination is found in text. Hmmm. I think I've provided enough code for one morning! At this point it is worth experimenting to see what best fits your particular application */
Book Review: Big Red - Voyage of a Trident Submarine
I've grown up reading Tom Clancy and probably most of you have at least seen Red October, so this book caught my eye when browsing used books for a recent trip. It's a fairly human look at what's involved in sailing on a Trident missile submarine...
2009-03-10
1,439 reads