Comparing Stored Procedures, Part 2
This picks up from an earlier article on trying to quantify the comparison of two stored procedures.
For a more complete explanation, please go to my blog entry here.
Here's a sample of that blog entry that summarizes what this script does:
"I devised an algorithm that would compare blocks of lines between them, starting with the largest possible sequence to compare (the size of this would be the number of lines of the smaller spd), and then compare smaller and smaller sequences until the comparison size was just one line (which is what was compared in Comparing Stored Procedures, Part 1). The trick to this approach is that if a match was found between to large sequences of lines, the subsequences in them would never be compared. Although this certainly helps with performance, the real reason behind this is that the fact that large blocks of code matched is very meaningful. Take this example: the string 'ABCD' matches 'ABDC' more than it matches 'DCBA'. How much "more" it matches is the subject of a later posting, where we establish our scoring algorithm."
Written by Jesse McLain
jesse@jessemclain.com
www.jessemclain.com
www.jessesql.blogspot.com
/*************************************************************************************************
** File: Compare Sequences.sql
** Desc: Second in a series of scripts demonstrating a quantitative comparison between the text
** of two stored procedures
**
** Return values: report & results
**
** Called by:
**
** Parameters:
** Input
** ----------
** none
**
** Output
** -----------
** none
**
** Auth: Jesse mcLain
** Email: jesse@jessemclain.com
** Web: www.jessemclain.com
** Blog: www.jessesql.blogspot.com
** Date: 01/31/2008
**
***************************************************************************************************
** Change History
***************************************************************************************************
** Date: Author: Description:
** -------- -------- -------------------------------------------
** 20080213 Jesse McLain Created script
**************************************************************************************************/DECLARE @spd1 varchar(max)
SET @spd1 = 'Test Sequence "AZXYZBC"'
DECLARE @spd2 varchar(max)
SET @spd2 = 'Test Sequence "BCDAEAXYZ"'
SET NOCOUNT ON
CREATE TABLE #spd1 (
CodeLineTxt varchar(max), /* stores the original line of code */ CodeLineNum int not null identity(1,1), /* stores the line number */ MatchLineNum int /* stores the matching line of code from spd #2 */)
INSERT INTO #spd1 (CodeLineTxt) VALUES ('A')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('Z')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('X')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('Y')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('Z')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('B')
INSERT INTO #spd1 (CodeLineTxt) VALUES ('C')
UPDATE #spd1 SET MatchLineNum = 0
CREATE TABLE #spd2 (
CodeLineTxt varchar(max),
CodeLineNum int not null identity(1,1),
MatchLineNum int
)
INSERT INTO #spd2 (CodeLineTxt) VALUES ('B')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('C')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('D')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('A')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('E')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('A')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('X')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('Y')
INSERT INTO #spd2 (CodeLineTxt) VALUES ('Z')
UPDATE #spd2 SET MatchLineNum = 0
DECLARE @cnt1 int
SELECT @cnt1 = COUNT(*) FROM #spd1
DECLARE @cnt2 int
SELECT @cnt2 = COUNT(*) FROM #spd2
DECLARE @CmprSz int
SELECT @CmprSz = CASE WHEN @cnt1 < @cnt2 THEN @cnt1 ELSE @cnt2 END
DECLARE @idx1 int
DECLARE @idx2 int
DECLARE @idx2_last int
DECLARE @idx3 int
DECLARE @NumCmprs int
/* the idea here is to compare blocks of lines from one table to blocks from the other table. we start with the blocks of largest size
and continue decreasing size until we get to blocks of one line each. Whenever a block matches that of a block in the other table, all
matching lines are flagged to indicate the match, so as to exclude those matches from subsequent comparisons.
*/
WHILE @CmprSz > 0
BEGIN
SET @NumCmprs = @cnt1 - @CmprSz + 1
SET @idx1 = 1
WHILE @idx1 <= @NumCmprs
BEGIN
/* ensure that subset is not yet matched: */ IF (SELECT COUNT(*) FROM #spd1 WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1) AND MatchLineNum = 0) = @CmprSz
BEGIN
DECLARE @cdtxt1 varchar(max); SELECT @cdtxt1 = CodeLineTxt FROM #spd1 WHERE CodeLineNum = @idx1
/* we need to find the first match in table 2. since there can be multiple rows in table 2 that match
the head of the subset from table 1, we need to check for multiples: */ SET @idx2_last = 0
SET @idx2 = 0
DECLARE @AllMatch tinyint; SET @AllMatch = 0
WHILE @AllMatch = 0 AND @idx2 IS NOT NULL
BEGIN
SELECT @idx2 = MIN(CodeLineNum)
FROM #spd2
WHERE CodeLineTxt = @cdtxt1
AND CodeLineNum > @idx2_last
AND CodeLineNum <= (@cnt2 - @CmprSz + 1) /* ensures that table 2 subset is big enough */ AND MatchLineNum = 0
IF @idx2 IS NOT NULL
BEGIN
SET @idx3 = 1
/* now check all matches. the easiest way to implement this checking is to first
assume that all lines match, and then record that fact that at least one does not. */ SET @AllMatch = 1
WHILE @idx3 <= @CmprSz AND @AllMatch = 1
BEGIN
IF NOT EXISTS (SELECT 1
FROM #spd1 T1
JOIN #spd2 T2
ON T1.CodeLineTxt = T2.CodeLineTxt
AND T1.CodeLineNum = (@idx1 + @idx3 - 1)
AND T2.CodeLineNum = (@idx2 + @idx3 - 1))
SET @AllMatch = 0
SET @idx3 = @idx3 + 1
END
IF @AllMatch = 1
BEGIN
UPDATE #spd1 SET MatchLineNum = @idx2
WHERE CodeLineNum BETWEEN @idx1 AND (@idx1 + @CmprSz - 1)
UPDATE #spd2 SET MatchLineNum = @idx1
WHERE CodeLineNum BETWEEN @idx2 AND (@idx2 + @CmprSz - 1)
END
END
SET @idx2_last = @idx2
END /* WHILE @AllMatch = 1 AND @idx2 IS NOT NULL */ END /* IF (SELECT COUNT(*) FROM #spd1 WHERE CodeLineNum BETWEEN ... */
SET @idx1 = @idx1 + 1
END /* WHILE @idx1 <= @NumCmprs */
SET @CmprSz = @CmprSz - 1
END /* WHILE @CmprSz > 0 */
PRINT 'spd1: ' + @spd1
PRINT 'spd2: ' + @spd2
PRINT ''
SELECT CodeLineTxt = CONVERT(varchar(10), CodeLineTxt), CodeLineNum, MatchLineNum FROM #spd1
SELECT CodeLineTxt = CONVERT(varchar(10), CodeLineTxt), CodeLineNum, MatchLineNum FROM #spd2
DECLARE @cnt1a int; SELECT @cnt1a = COUNT(*) FROM #spd1 T1 WHERE MatchLineNum <> 0
DECLARE @cnt2a int; SELECT @cnt2a = COUNT(*) FROM #spd2 T1 WHERE MatchLineNum <> 0
PRINT ''
PRINT 'Percentage match between the spds: ' + LTRIM(STR(100.0 * (@cnt1a / (1.0 * @cnt1) + @cnt2a / (1.0 * @cnt2)) / 2, 10, 2)) + '%'
PRINT 'Percentage of spd1 found in spd2: ' + LTRIM(STR(100.0 * @cnt1a / (1.0 * @cnt1), 10, 2)) + '%'
PRINT 'Percentage of spd2 found in spd1: ' + LTRIM(STR(100.0 * @cnt2a / (1.0 * @cnt2), 10, 2)) + '%'
PRINT ''
PRINT ''
DROP TABLE #spd1
DROP TABLE #spd2
SET NOCOUNT OFF