Converting a XML datatype to varchar(max).

  • ben.brugman (8/23/2016)

    I would like a solution without the FOR XML, is there one ?

    I think there is. I generally avoid scalar user defined functions like the plague but check this out. First for some sample data:

    CREATE TABLE dbo.strings

    (

    stringID int identity NOT NULL,

    strGroup smallint NOT NULL,

    string varchar(36) NOT NULL,

    CONSTRAINT pk_strings PRIMARY KEY CLUSTERED(strGroup, stringID)

    );

    INSERT dbo.strings (strGroup, string)

    SELECT TOP (1000000)

    strGroup = ABS(checksum(newid())%20000)+1,

    string = REPLACE(LEFT(newid(), ABS(checksum(newid())%40)+1),'-','')

    FROM sys.all_columns a, sys.all_columns b;

    The my clustered index is key here, we want to avoid a sort when grouping by strGroup.

    Now the Scalar UDF:

    CREATE FUNCTION dbo.fnConcatinateRows(@id smallint, @delimiter varchar(10))

    RETURNS varchar(8000) WITH SCHEMABINDING AS

    BEGIN

    DECLARE @cString varchar(8000)='';

    SELECT @cString += @delimiter + string

    FROM dbo.strings

    WHERE strGroup = @id;

    RETURN SUBSTRING(@cString,LEN(@delimiter)+1,8000);

    END

    GO

    -- examples

    SELECT dbo.fnConcatinateRows(3,',');

    SELECT dbo.fnConcatinateRows(3,'<CR><LF>');

    Next we'll check out the I/O when comparing my scalar UDF to the XML PATH method.

    SET STATISTICS IO ON;

    PRINT 'fnConcatinateRows';

    PRINT REPLICATE('----------',5);

    SELECT strGroup, dbo.fnConcatinateRows(strGroup,',')

    FROM dbo.strings

    GROUP BY strGroup;

    PRINT CHAR(13)+CHAR(10)+'XML PATH without special XML Protection, serial plan';

    PRINT REPLICATE('----------',5);

    SELECT

    strGroup,

    STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH('')),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup;

    PRINT CHAR(13)+CHAR(10)+'XML PATH with special XML Protection, serial plan';

    PRINT REPLICATE('----------',5);

    SELECT

    strGroup,

    STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH(''), TYPE

    ).value('./text()[1]','varchar(8000)'),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup;

    SET STATISTICS IO OFF;

    fnConcatinateRows

    --------------------------------------------------

    Table 'strings'. Scan count 1, logical reads 4577, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

    XML PATH without special XML Protection, serial plan

    --------------------------------------------------

    Table 'strings'. Scan count 20001, logical reads 69222, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

    XML PATH with special XML Protection, serial plan

    --------------------------------------------------

    Table 'strings'. Scan count 20001, logical reads 69222, physical reads 0, read-ahead reads 0, lob logical reads 0, lob physical reads 0, lob read-ahead reads 0.

    Well, well... 4,577 reads for the scalar udf vs 69,222 with the XML PATH method (regardless of special character protection) :w00t::w00t::w00t::w00t::w00t::w00t::w00t:

    Now let's check the speed.

    PRINT 'fnConcatinateRows';

    PRINT REPLICATE('----------',5);

    GO

    DECLARE @st datetime2 = getdate(), @sg smallint, @x varchar(8000);

    SELECT

    @sg = strGroup,

    @x = dbo.fnConcatinateRows(strGroup,',')

    FROM dbo.strings

    GROUP BY strGroup;

    PRINT DATEDIFF(MS,@st,getdate());

    GO 3

    PRINT CHAR(13)+CHAR(10)+'XML PATH without special XML Protection, serial plan';

    PRINT REPLICATE('----------',5);

    GO

    DECLARE @st datetime2 = getdate(), @sg smallint, @x varchar(8000);

    SELECT

    @sg = strGroup,

    @x = STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH('')),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup;

    PRINT DATEDIFF(MS,@st,getdate());

    GO 3

    PRINT CHAR(13)+CHAR(10)+'XML PATH without special XML Protection, parallel plan';

    PRINT REPLICATE('----------',5);

    GO

    DECLARE @st datetime2 = getdate(), @sg smallint, @x varchar(8000);

    SELECT

    @sg = strGroup,

    @x = STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH('')),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup

    OPTION (RECOMPILE, QUERYTRACEON 8649);

    PRINT DATEDIFF(MS,@st,getdate());

    GO 3

    PRINT CHAR(13)+CHAR(10)+'XML PATH with special XML Protection, serial plan';

    PRINT REPLICATE('----------',5);

    GO

    DECLARE @st datetime2 = getdate(), @sg smallint, @x varchar(8000);

    SELECT

    @sg = strGroup,

    @x = STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH(''), TYPE

    ).value('./text()[1]','varchar(8000)'),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup

    PRINT DATEDIFF(MS,@st,getdate());

    GO 3

    PRINT CHAR(13)+CHAR(10)+'XML PATH with special XML Protection, parallel plan';

    PRINT REPLICATE('----------',5);

    GO

    DECLARE @st datetime2 = getdate(), @sg smallint, @x varchar(8000);

    SELECT

    @sg = strGroup,

    @x = STUFF((

    SELECT ','+string

    FROM dbo.strings b

    WHERE a.strGroup = b.strGroup

    FOR XML PATH(''), TYPE

    ).value('./text()[1]','varchar(8000)'),1,1,'')

    FROM dbo.strings a

    GROUP BY strGroup

    OPTION (RECOMPILE, QUERYTRACEON 8649);

    PRINT DATEDIFF(MS,@st,getdate());

    GO 3

    fnConcatinateRows

    --------------------------------------------------

    Beginning execution loop

    637

    647

    626

    Batch execution completed 3 times.

    XML PATH without special XML Protection, serial plan

    --------------------------------------------------

    Beginning execution loop

    1053

    1093

    1054

    Batch execution completed 3 times.

    XML PATH without special XML Protection, parallel plan

    --------------------------------------------------

    Beginning execution loop

    557

    590

    557

    Batch execution completed 3 times.

    XML PATH with special XML Protection, serial plan

    --------------------------------------------------

    Beginning execution loop

    1450

    1367

    1363

    Batch execution completed 3 times.

    XML PATH without special XML Protection, parallel plan

    --------------------------------------------------

    Beginning execution loop

    770

    774

    763

    Batch execution completed 3 times.

    In this (hastily prepared) performance test the scalar UDF is faster with a serial plan (you can't get a parallel plan when a scalar udf is involved) EXCEPT when running the FOR XML PATH technique without special XML character protection with a parallel plan which produces marginally better results (but more reads and a lot more CPU overhead).

    I think, in this rare case, the scalar UDF is the way to go.

    "I cant stress enough the importance of switching from a sequential files mindset to set-based thinking. After you make the switch, you can spend your time tuning and optimizing your queries instead of maintaining lengthy, poor-performing code."

    -- Itzik Ben-Gan 2001

  • Thanks, it's clear now.

    It must be a bad habit of going every time via variables with explicitly defined types (either scalar or table ) which prevented me from hitting this wall before.

    Indeed, if we skip @x and try to pull it directly from the subquery, it will fail.

    _____________
    Code for TallyGenerator

  • Hello Alan and others,

    First thanks for supplying the scripts and the timings.

    Great.

    Alan.B (8/24/2016)


    Well, well... 4,577 reads for the scalar udf vs 69,222 with the XML PATH method (regardless of special character protection) :w00t::w00t::w00t::w00t::w00t::w00t::w00t:

    In this (hastily prepared) performance test the scalar UDF is faster with a serial plan (you can't get a parallel plan when a scalar udf is involved) EXCEPT when running the FOR XML PATH technique without special XML character protection with a parallel plan which produces marginally better results (but more reads and a lot more CPU overhead).

    For my specific situation a string can be long. So I made some alterations to your scripts.

    Instead of 20000 groups I changed that to 20 groups. (A change of 20000 to 20).

    Instead of the results in varchar(8000) I changed the results to varchar(MAX). (A change of 8000 to MAX).

    And added the statement: delete strings where strGroup <> 3 (To speed the process up a bit.)

    ------ 'fnConcatinateRows';

    This resulted on my machine on CPU time = 91105 ms, elapsed time = 91153 ms.

    -- PRINT CHAR(13)+CHAR(10)+'XML PATH without special XML Protection, serial plan';

    CPU time = 46 ms, elapsed time = 46 ms.

    -- PRINT CHAR(13)+CHAR(10)+'XML PATH without special XML Protection, parallel plan';

    CPU time = 46 ms, elapsed time = 47 ms.

    (A change of :w00t::w00t::w00t::w00t::w00t::w00t::w00t: to :(:(:(:(:(:(:( )

    This was also done with some haste, so please do not trust these results.

    (Done in haste on a SQL 2008/R2 installation)

    But can you confirm this.

    Again, thanks for the scripts, only used them in 'haste', but I think they are a solid fundament to work with.

    Ben

  • Additional scripting for testing the special characters, to be run after the table is populated:

    --

    -- After creating and populating the table I reduce the number of groups TO 1

    --

    delete strings where strGroup <> 3

    --

    -- Create some prepared strings which show special XML Protection

    --

    delete strings where stringid >= 20 and stringid < 40

    SET IDENTITY_INSERT strings ON

    insert into strings (stringID,strgroup,string) select 20,3,'ABCDEFG'

    insert into strings (stringID,strgroup,string) select 21,3,'less and greater ABC<>DEFG'

    insert into strings (stringID,strgroup,string) select 22,3,'ampersand ABC&DEFG'

    insert into strings (stringID,strgroup,string) select 23,3,'comma ABC,DEFG'

    insert into strings (stringID,strgroup,string) select 24,3,'semicolon ABC;DEFG'

    insert into strings (stringID,strgroup,string) select 25,3,'space ABC DEFG'

    insert into strings (stringID,strgroup,string) select 26,3,'qoute ABC''DEFG'

    insert into strings (stringID,strgroup,string) select 27,3,'double qoute ABC"DEFG'

    insert into strings (stringID,strgroup,string) select 28,3,'two double qoutes ABC""DEFG'

    insert into strings (stringID,strgroup,string) select 29,3,'<CR><LF> ABC'+CHAR(13)+CHAR(10)+'DEFG'

    insert into strings (stringID,strgroup,string) select 30,3,'LF ABC'+CHAR(10)+'DEFG'

    insert into strings (stringID,strgroup,string) select 31,3,'CR ABC'+CHAR(13)+'DEFG'

    insert into strings (stringID,strgroup,string) select 32,3,'tab ABC'+CHAR(9)+'DEFG'

    SET IDENTITY_INSERT strings OFF

    Alan has prepared distinct scripts for handling special characters, these additions make some of the differences visible. (I still have to study the behaviour of <CR><LF> and <CR> and <LF> constructions).

    Ben

Viewing 4 posts - 16 through 18 (of 18 total)

You must be logged in to reply to this topic. Login to reply