Comparing Stored Procedures, Part 5
This picks up from an earlier article on trying to quantify the comparison of two stored procedures. This script creates a stored procedure which contains a critical code optimization to an earlier version..
For a more complete explanation, please go to my blog entry here.
Written by Jesse McLain
jesse@jessemclain.com
www.jessemclain.com
http://jessesql.blogspot.com
/*************************************************************************************************
** Proc: "spd_SequenceCompare_Version2"
** Desc: This stored procedure will compare the items in two sequences. The sequences should
** be stored in two tables, Seq1 and Seq2, which should have the following structure:
**
** CREATE TABLE Seq1 (
** CodeLineTxt varchar(max), /* stores the original line of code */** CodeLineNum int not null identity(1,1), /* stores the line number */** MatchLineNum int /* stores the matching line of code from spd #2 */** )
**
** This spd will compare smaller and smaller subsequences in the tables, and when it
** finds a match, it will note it in the column MatchLineNum.
**
**
**
** Return values: none
**
** Called by:
**
** Parameters:
** Input
** ----------
** none
**
** Output
** -----------
** none
**
** Auth: Jesse McLain
** Email: jesse@jessemclain.com
** Web: www.jessemclain.com
** Blog: http://jessesql.blogspot.com/2009/02/comparing-spds-part-5-optimization.html
**
** Date: 02/16/2008
**
***************************************************************************************************
** Change History
***************************************************************************************************
** Date: Author: Description:
** -------- -------- -------------------------------------------
** 20080216 Jesse McLain Created script
** 20080223 Jesse McLain Added CTE code to better determine @CmprSz value
***************************************************************************************************
**************************************************************************************************/
CREATE PROCEDURE spd_SequenceCompare_Version2 AS
SET NOCOUNT ON
DECLARE @CmprSz int
DECLARE @idx1 int
DECLARE @idx2 int
DECLARE @idx2_last int
DECLARE @idx3 int
DECLARE @NumCmprs int
DECLARE @cdtxt1 varchar(max);
DECLARE @AllMatch tinyint;
DECLARE @Seq1Size int; SELECT @Seq1Size = COUNT(*) FROM Seq1
DECLARE @Seq2Size int; SELECT @Seq2Size = COUNT(*) FROM Seq1
/* 2/17/09 - I'm going to try out several enhancements to improve performance of this
spd. First I'm going to run a pre-flight matching between the members of the two
sequences, and then omit any subsequence checks with non-matching members. What this
would also allow is lowering the 'compare size' (@CmpSz) to the length of the maximum
subsequence without any non-matches (actually the lesser of the max between the two
sequences - CORRECTION - the max lens of the matching subsequences will be the same
between the 2 main sequences). This is where our greatest cost savings will be. */
/* first we'll update each sequence so that the MatchLineNum column for every value
of CodeLineTxt will indicate whether there exists a match in the other sequence. */UPDATE Seq1
SET MatchLineNum = CASE
WHEN EXISTS (SELECT 1 FROM Seq2 WHERE Seq2.CodeLineTxt = Seq1.CodeLineTxt) THEN -1
ELSE 0 END
UPDATE Seq2
SET MatchLineNum = CASE
WHEN EXISTS (SELECT 1 FROM Seq1 WHERE Seq1.CodeLineTxt = Seq2.CodeLineTxt) THEN -1
ELSE 0 END
/* 2/23/09 - this block of CTE code will determine the max lengths of subsequences of
the main sequences containing matching items in the other sequence. Previously we would
set @CmprSz to the size of the smaller of the two sequences. What we're doing now is
checking all of the possible matching subsequences (by checking for matches above in
a one-to-one fashion), taking the size of the longest possible subsequence
this page helped with the CTE code: http://www.sqlservercentral.com/articles/T-SQL/61461/ */
;WITH
Seq1starts AS (
SELECT CodeLineNum
FROM Seq1 S1
WHERE MatchLineNum = -1
AND NOT EXISTS (SELECT * FROM Seq1 S2
WHERE MatchLineNum = -1
AND S2.CodeLineNum = S1.CodeLineNum - 1
)
),
Seq1ends AS (
SELECT CodeLineNum = CodeLineNum
FROM Seq1 S1
WHERE MatchLineNum = -1
AND NOT EXISTS (SELECT * FROM Seq1 S2
WHERE MatchLineNum = -1
AND S2.CodeLineNum = S1.CodeLineNum + 1
)
),
Seq1sequences AS (
SELECT
SeqStartsAt = Seq1starts.CodeLineNum,
SeqEndsAt = (SELECT MIN(Seq1ends.CodeLineNum)
FROM Seq1ends
WHERE Seq1ends.CodeLineNum >= Seq1starts.CodeLineNum)
FROM Seq1starts
),
Seq1maxlenseq AS (
SELECT MaxLen = MAX(SeqEndsAt - SeqStartsAt + 1)
FROM Seq1sequences
),
Seq2starts AS (
SELECT CodeLineNum
FROM Seq2 S1
WHERE MatchLineNum = -1
AND NOT EXISTS (SELECT * FROM Seq2 S2
WHERE MatchLineNum = -1
AND S2.CodeLineNum = S1.CodeLineNum - 1
)
),
Seq2ends AS (
SELECT CodeLineNum = CodeLineNum
FROM Seq2 S1
WHERE MatchLineNum = -1
AND NOT EXISTS (SELECT * FROM Seq2 S2
WHERE MatchLineNum = -1
AND S2.CodeLineNum = S1.CodeLineNum + 1
)
),
Seq2sequences AS (
SELECT
SeqStartsAt = Seq2starts.CodeLineNum,
SeqEndsAt = (SELECT MIN(Seq2ends.CodeLineNum)
FROM Seq2ends
WHERE Seq2ends.CodeLineNum >= Seq2starts.CodeLineNum)
FROM Seq2starts
),
Seq2maxlenseq AS (
SELECT MaxLen = MAX(SeqEndsAt - SeqStartsAt + 1)
FROM Seq2sequences
)
SELECT @CmprSz = CASE WHEN Seq1maxlenseq.MaxLen < Seq2maxlenseq.MaxLen THEN Seq1maxlenseq.MaxLen ELSE Seq2maxlenseq.MaxLen END
FROM Seq1maxlenseq, Seq2maxlenseq
UPDATE Seq1 SET MatchLineNum = 0
UPDATE Seq2 SET MatchLineNum = 0
/* IF @CmprSz IS NULL then we know that no matches exist between the sequences, and can thus exit early */IF @CmprSz IS NULL
RETURN
-- here for testing; remove later:
PRINT '@CmprSz = ' + LTRIM(STR(@CmprSz))
/* 2/16/09 - the idea here is to compare blocks of lines from one table to
blocks from the other table. we start with the blocks of largest size
and continue decreasing size until we get to blocks of one line each.
Whenever a block matches that of a block in the other table, all
matching lines are flagged to indicate the match, so as to exclude
those matches from subsequent comparisons.
*/
WHILE @CmprSz > 0
BEGIN
SET @NumCmprs = @Seq1Size - @CmprSz + 1
SET @idx1 = 1
WHILE @idx1 <= @NumCmprs
BEGIN
/* ensure that every element of subsequence is not yet matched: */ IF (SELECT COUNT(*) FROM Seq1 WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1) AND MatchLineNum = 0) = @CmprSz
BEGIN
SELECT @cdtxt1 = CodeLineTxt FROM Seq1 WHERE CodeLineNum = @idx1
/* we need to find the first match in table 2. since there can be multiple rows in table 2 that match
the head of the subset from table 1, we need to check for multiples: */ SET @idx2_last = 0
SET @idx2 = 0
SET @AllMatch = 0
WHILE @AllMatch = 0 AND @idx2 IS NOT NULL
BEGIN
SELECT @idx2 = MIN(CodeLineNum)
FROM Seq2
WHERE CodeLineTxt = @cdtxt1
AND CodeLineNum > @idx2_last
AND CodeLineNum <= (@Seq2Size - @CmprSz + 1) /* ensures that table 2 subset is big enough */ AND MatchLineNum = 0
IF @idx2 IS NOT NULL
BEGIN
SET @idx3 = 1
/* now check all matches. the easiest way to implement this checking is to first
assume that all lines match, and then record that fact that at least one does not. */ SET @AllMatch = 1
WHILE @idx3 <= @CmprSz AND @AllMatch = 1
BEGIN
IF NOT EXISTS (SELECT 1
FROM Seq1 T1
JOIN Seq2 T2
ON T1.CodeLineTxt = T2.CodeLineTxt
AND T1.CodeLineNum = (@idx1 + @idx3 - 1)
AND T2.CodeLineNum = (@idx2 + @idx3 - 1))
SET @AllMatch = 0
SET @idx3 = @idx3 + 1
END
IF @AllMatch = 1
BEGIN
UPDATE Seq1 SET MatchLineNum = @idx2
WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1)
UPDATE Seq2 SET MatchLineNum = @idx1
WHERE CodeLineNum BETWEEN @idx2 AND (@idx2 + @CmprSz - 1)
END
END
SET @idx2_last = @idx2
END /* WHILE @AllMatch = 1 AND @idx2 IS NOT NULL */ END /* IF (SELECT COUNT(*) FROM Seq1 WHERE CodeLineNum BETWEEN ... */
SET @idx1 = @idx1 + 1
END /* WHILE @idx1 <= @NumCmprs */
SET @CmprSz = @CmprSz - 1
END /* WHILE @CmprSz > 0 */
RETURN