March 29, 2015 at 4:27 pm
Comments posted to this topic are about the item Search a string in a text with number of occurrences
Thanks.
April 6, 2015 at 2:52 pm
Here's an alternative that uses a tally table:
;with dataLen as (select len(@inputString) i, len(@searchString) s)
select
'POSITION OF THE SEARCH STRING >> "' + @searchString + '" IS AT :' + cast(tt.i as varchar(100)),
count(tt.i) over (partition by null) numberFound
from dbo.talleyTable(len(@inputString)) tt
cross join dataLen dl
where tt.i between 1 and (dl.i - dl.s + 1)
and left(right(@inputString, dl.i - tt.i + 1), dl.s) = @searchString;
Don Simpson
April 6, 2015 at 3:43 pm
Adam Machanic has a clr table valued function to split a string on a delimiter(http://sqlblog.com/blogs/adam_machanic/archive/2009/04/26/faster-more-scalable-sqlclr-string-splitting.aspx). Here's an adaptation of his clr that I put together in vb.net to get to the same information your procedure does in a different way that I think is a little bit more flexible. Just another thought on solving the problem.
Imports System
Imports System.Data
Imports System.Data.SqlClient
Imports System.Data.SqlTypes
Imports Microsoft.SqlServer.Server
Partial Public Class UserDefinedFunctions
Public Class stringManipulation
Implements IEnumerator
Private inputString As String
Private delimiter As String
Private nextPos As Integer
Private lastPos As Integer
Private stringProp As New stringProperties()
Public Sub New(inputString As String, delimiter As String)
Me.inputString = inputString
Me.delimiter = delimiter
Me.lastPos = -1
Me.nextPos = -1
End Sub
Public ReadOnly Property Current As Object Implements System.Collections.IEnumerator.Current
Get
Return DirectCast(stringProp, Object)
End Get
End Property
Public Function MoveNext() As Boolean Implements System.Collections.IEnumerator.MoveNext
If (nextPos >= inputString.Length) Then
Return False
Else
lastPos = nextPos + delimiter.Length
If (lastPos = inputString.Length) Then
Return False
End If
nextPos = inputString.IndexOf(delimiter, lastPos)
If nextPos = -1 Then
Return False
Else
stringProp.sequenceProp += 1
stringProp.indexPositionProp = nextPos + 1
End If
Return True
End If
End Function
Public Sub Reset() Implements System.Collections.IEnumerator.Reset
Me.lastPos = -1
Me.nextPos = -1
End Sub
End Class
Public Class stringProperties
Private sequence As Integer
Private indexPosition As Integer
Public Property sequenceProp() As Integer
Get
Return Me.sequence
End Get
Set(value As Integer)
Me.sequence = value
End Set
End Property
Public Property indexPositionProp() As Integer
Get
Return Me.indexPosition
End Get
Set(value As Integer)
Me.indexPosition = value
End Set
End Property
End Class
<Microsoft.SqlServer.Server.SqlFunction(FillRowMethodName:="findString_fillRow", _
TableDefinition:="sequence int, indexPosition int")> _
Public Shared Function findString(
inputString As SqlString,
<SqlFacet(MaxSize:=100)> delimiter As SqlString
) As IEnumerator
If inputString.IsNull Or delimiter.IsNull Or delimiter.ToString.Length > inputString.ToString.Length Then
Return (New stringManipulation("", ","))
Else
Return (New stringManipulation(inputString.ToString, delimiter.ToString))
End If
End Function
Public Shared Sub findString_fillRow(strCollectionObj As Object, ByRef sequence As SqlInt32, ByRef indexPosition As SqlInt32)
Dim stringProps As stringProperties = DirectCast(strCollectionObj, stringProperties)
sequence = stringProps.sequenceProp
indexPosition = stringProps.indexPositionProp
End Sub
End Class
Here's example results for finding the sql index of every space in the string passed to the tvf:
SELECT *
FROM dbo.findString('Here Are The Laws You Need To Know If You Bought A Defective Product',' ')
Here's some test data followed by several examples of manipulating the data returned by the tvf. jalopnikHeadline refers to headlines found on http://www.jalopnik.com/:
IF OBJECT_ID(N'dbo.findStringTest',N'U') IS NOT NULL
DROP TABLE dbo.findStringTest
GO
CREATE TABLE dbo.findStringTest
(
[id] TINYINT IDENTITY(1,1) NOT NULL
,[jalopnikHeadline] VARCHAR(255) NOT NULL
,CONSTRAINT pk_findStringTest PRIMARY KEY CLUSTERED([id])
)
GO
INSERT INTO dbo.findStringTest
(
[jalopnikHeadline]
)
SELECT 'Apple''s Sensor-Covered Minivans Are Invading Texas'
UNION
SELECT 'Exactly Where The 2015 Ford F-150 Lost Weight, Below The Aluminum Body'
UNION
SELECT 'Oh, Good: We Might Hear More Music From Lewis Hamilton About His Ex'
UNION
SELECT 'Tech Blogger Says He Watched As Youths Wirelessly Broke Into His Car'
UNION
SELECT 'Here Are The Laws You Need To Know If You Bought A Defective Product'
UNION
SELECT 'How To Drive An Illegal-*** Car For Years Without Getting Busted'
UNION
SELECT 'Here Are Ten Of The Best Dream Cars On eBay For Less Than $80,000'
UNION
SELECT 'Let American Hero Jack Diamond Teach You How To Flip A Car Properly'
GO
-- Review the inserted data
SELECT *
FROM dbo.findStringTest
-- Full results returned by the tvf with cross apply
SELECT *
FROM dbo.findStringTest
OUTER APPLY dbo.findString([jalopnikHeadline],' ') fs
-- Returning comma separated indexes for a single string into a variable
DECLARE @searchIndexes NVARCHAR(100) = ''
SELECT @searchIndexes += ',' + CAST(fs.[indexPosition] AS VARCHAR)
FROM dbo.findString('This is a test string for returning the position of a delimiter in a string n times ',' ') fs
PRINT('@searchIndexes: ' + SUBSTRING(@searchIndexes,2,LEN(@searchIndexes)))
-- Returning comma separated indexes for a single string using xml path
SELECT SUBSTRING((
SELECT ',' + CAST([indexPosition] AS VARCHAR)
FROM dbo.findString('This is a test string for returning the position of a delimiter in a string n times ',' ')
FOR XML PATH('')
),2,4000)
-- Returning comma separated indexes for strings in a query
SELECT *
,(SELECT SUBSTRING((
SELECT ',' + CAST(fs.[indexPosition] AS VARCHAR)
FROM dbo.findStringTest fsti
OUTER APPLY dbo.findString(fsti.[jalopnikHeadline],' ') fs
WHERE fsti.[id] = fst.[id]
FOR XML PATH('')
),2,4000)) AS [searchIndexes]
FROM dbo.findStringTest fst
-- Pivot data returned from the function from multiple rows into one row per id with a column for each index
;WITH indexScrapedMaster
AS
(
SELECT [id],[jalopnikHeadline] AS [inputString]
,fs.[sequence],fs.[indexPosition]
FROM dbo.findStringTest
OUTER APPLY dbo.findString([jalopnikHeadline],' ') fs
)
SELECT [id],[inputString]
[1],[2],[3],[4],[5]
,[6],[7],[8],[9],[10]
,[11],[12],[13],[14],[15]
FROM
(
SELECT [id],[inputString]
,CAST([sequence] AS VARCHAR) AS [indexSequence]
,[indexPosition]
FROM indexScrapedMaster
) st
PIVOT
(
MAX([indexPosition]) FOR [indexSequence] IN
(
[1],[2],[3],[4],[5]
,[6],[7],[8],[9],[10]
,[11],[12],[13],[14],[15]
)
) pvt
ORDER BY [id]
GO
IF OBJECT_ID(N'dbo.findStringTest',N'U') IS NOT NULL
DROP TABLE dbo.findStringTest
GO
April 7, 2015 at 3:32 pm
Just adding a solution that goes a little bit further than Don's.
WITH
E(n) AS(
SELECT n FROM (VALUES(0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) E(n) --10 rows
),
E2(n) AS(
SELECT a.n FROM E a, E b --10 x 10 = 100 rows
),
E4(n) AS(
SELECT a.n FROM E2 a, E2 b --100 x 100 = 10,000 rows
),
cteTally AS(
SELECT TOP(len(@inputString) - len(@searchString) + 1) --Limit the number of rows
ROW_NUMBER() OVER( ORDER BY (SELECT NULL)) i
FROM E4
),
cteResults AS(
select
CAST( count(tt.i) over (partition by null) AS varchar(5)) numberFound ,
tt.i
from cteTally tt
where SUBSTRING(@inputString, tt.i, len(@searchString)) = @searchString
)
SELECT Result
FROM cteResults
CROSS APPLY( VALUES(1, 'POSITION OF THE SEARCH STRING >> "' + @searchString + '" IS AT : ' + cast(i as varchar(5))),
(2, 'THE SEARCHED STRING APPEARED >> ' + numberFound + ' TIMES'),
(3, '**** END OF SEARCH ***') ) cav(roworder, Result)
GROUP BY Result, roworder
ORDER BY roworder, MIN(i)
April 16, 2016 at 11:20 pm
I know thread is a little old but this is exactly the kind of thing you could do with a good NGrams8K function. Here's the function:
CREATE FUNCTION dbo.NGrams8k
(
@string varchar(8000), -- Input string
@N int -- requested token size
)
/****************************************************************************************
Purpose:
A character-level @N-Grams function that outputs a contiguous stream of @N-sized tokens
based on an input string (@string). Accepts strings up to 8000 varchar characters long.
For more information about N-Grams see: http://en.wikipedia.org/wiki/N-gram.
Compatibility:
SQL Server 2008+ and Azure SQL Database
Syntax:
--===== Autonomous
SELECT position, token FROM dbo.NGrams8k(@string,@N);
--===== Against a table using APPLY
SELECT s.SomeID, ng.position, ng.string
FROM dbo.SomeTable s
CROSS APPLY dbo.NGrams8K(s.SomeValue,@N) ng;
Parameters:
@string = The input string to split into tokens.
@N = The size of each token returned.
Returns:
Position = bigint; the position of the token in the input string
token = varchar(8000); a @N-sized character-level N-Gram token
Examples:
--===== Turn the string, 'abcd' into unigrams, bigrams and trigrams
SELECT position, token FROM dbo.NGrams8k('abcd',1); -- unigrams (@N=1)
SELECT position, token FROM dbo.NGrams8k('abcd',2); -- bigrams (@N=2)
SELECT position, token FROM dbo.NGrams8k('abcd',3); -- trigrams (@N=1)
--===== How many times the substring "AB" appears in each record
DECLARE @table TABLE(stringID int identity primary key, string varchar(100));
INSERT @table(string) VALUES ('AB123AB'),('123ABABAB'),('!AB!AB!'),('AB-AB-AB-AB-AB');
SELECT string, occurances = COUNT(*)
FROM @table t
CROSS APPLY dbo.NGrams8k(t.string,2) ng
WHERE ng.token = 'AB'
GROUP BY string;
Developer Notes:
1. This function is not case sensitive
2. Many functions that use NGrams8k will see a huge performance gain when the optimizer
creates a parallel query plan. One way to get a parallel query plan (if the optimizer
does not chose one) is to use make_parallel by Adam Machanic which can be found here:
sqlblog.com/blogs/adam_machanic/archive/2013/07/11/next-level-parallel-plan-porcing.aspx
3. When @N is less than 1 or greater than the datalength of the input string then no
tokens (rows) are returned.
4. This function can also be used as a tally table with the position column being your
"N" row. To do so use REPLICATE to create an imaginary string, then use NGrams8k to
split it into unigrams then only return the position column. NGrams8k will get you up
to 8000 numbers. There will be no performance penalty for sorting by position in
ascending order but there is for sorting in descending order. To get the numbers in
descending order without forcing a sort in the query plan use the following formula:
N = <highest number>-position+1.
Pseudo Tally Table Examples:
--===== (1) Get the numbers 1 to 100 in ascending order:
SELECT N = position FROM dbo.NGrams8k(REPLICATE(0,100),1);
--===== (2) Get the numbers 1 to 100 in descending order:
DECLARE @maxN int = 100;
SELECT N = @maxN-position+1
FROM dbo.NGrams8k(REPLICATE(0,@maxN),1)
ORDER BY position;
-- note: you don't need a variable, I used one to make the example easier to understand.
----------------------------------------------------------------------------------------
Revision History:
Rev 00 - 20140310 - Initial Development - Alan Burstein
Rev 01 - 20150522 - Removed DQS N-Grams functionality, improved iTally logic. Also Added
conversion to bigint in the TOP logic to remove implicit conversion
to bigint - Alan Burstein
Rev 03 - 20150909 - Added logic to only return values if @N is greater than 0 and less
than the length of @string. Updated comment section. - Alan Burstein
Rev 04 - 20151029 - Added ISNULL logic to the TOP clause for the @string and @N
parameters to prevent a NULL string or NULL @N from causing "an
improper value" being passed to the TOP clause. - Alan Burstein
****************************************************************************************/
RETURNS TABLE WITH SCHEMABINDING AS RETURN
WITH
L1(N) AS
(
SELECT 1
FROM (VALUES -- 90 NULL values used to create the CTE Tally table
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),
(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)
) t(N)
),
iTally(N) AS -- my cte Tally table
(
SELECT TOP(ABS(CONVERT(BIGINT,(DATALENGTH(ISNULL(@string,''))-(ISNULL(@N,1)-1)),0)))
ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -- Order by a constant to avoid a sort
FROM L1 a CROSS JOIN L1 b -- cartesian product for 8100 rows (90^2)
)
SELECT
position = N, -- position of the token in the string(s)
token = SUBSTRING(@string,CAST(N AS int),@N) -- the @N-Sized token
FROM iTally
WHERE @N > 0 AND @N <= DATALENGTH(@string); -- force the function to ignore a "bad @N"
Here's a purely set-based way to find a string within a string along with it's location in the string.
DECLARE
@inputString varchar(800) = 'abc123xyz123',
@searchString varchar(800) = '123';
SELECT *
FROM dbo.NGrams8k(@inputString, LEN(@searchstring))
WHERE token = @searchstring;
-- Itzik Ben-Gan 2001
Viewing 5 posts - 1 through 4 (of 4 total)
You must be logged in to reply to this topic. Login to reply