July 23, 2013 at 9:27 am
Hi,
I'm importing some XML files into SQL which have an XSD schema which I am validating against. The "issue" I've found is that some of the data in the files is contained within CDATA tags but the corresponding XML element is defined as an xsd:token (from W3Schools - "The token data type also contains characters, but the XML processor will remove line feeds, carriage returns, tabs, leading and trailing spaces, and multiple spaces."). What I noticed was that when I ran a simple XQuery in SQL against the Typed XML variable it was applying the xsd:token rules of removing multiple spaces to the CDATA values. Before I go back to the supplier of the file I just wanted to double check what the correct result should be. See below snippet of code...
CREATEXML SCHEMA COLLECTION dbo.MyTestSchema
AS
N'
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<xsd:element name="ROOT">
<xsd:complexType>
<xsd:complexContent>
<xsd:restriction base="xsd:anyType">
<xsd:sequence>
<xsd:element ref="Test" minOccurs="0" maxOccurs="unbounded"/>
</xsd:sequence>
</xsd:restriction>
</xsd:complexContent>
</xsd:complexType>
</xsd:element>
<xsd:element name="Test" type="TestType"/>
<xsd:simpleType name="TestType">
<xsd:restriction base="xsd:token">
</xsd:restriction>
</xsd:simpleType>
</xsd:schema>'
GO
DECLARE@XMLData varchar(MAX) =
'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
DECLARE@XML xml = @XMLData
DECLARE@MyTestXML xml(CONTENT dbo.MyTestSchema) = @XMLData
;WITHWithoutSchema AS
(
SELECT
[Test] = NULLIF(T2.n.value('.', 'varchar(10)'), '')
FROM@XML.nodes('/ROOT') AS T1(n)
CROSSAPPLY T1.n.nodes('Test') AS T2(n)
),
WithSchema AS
(
SELECT
[Test] = NULLIF(T2.n.value('.', 'varchar(10)'), '')
FROM@MyTestXML.nodes('/ROOT') AS T1(n)
CROSSAPPLY T1.n.nodes('Test') AS T2(n)
)
SELECT[WithoutSchema] = N.Test, [WithSchema] = Y.Test
FROMWithoutSchema N
INNERJOIN WithSchema Y
ONREPLACE(N.Test, ' ', '') = REPLACE(Y.Test, ' ', '')
GO
DROPXML SCHEMA COLLECTION dbo.MyTestSchema
GO
...the results are...
WithoutSchema WithSchema
------------- ----------
0spaces 0spaces
1 space 1 space
2 spaces 2 spaces
3 spaces 3 spaces
...as you can see, using a non-typed xml variable preserves the spaces in the CDATA text, but using the typed variable (which uses xsd:token) is stripping them out. Should that be happening, I thought xsd was only applied to non-CDATA values? The spaces have a meaning in the data we are loading so if that is correct behaviour I need to raise with the supplier.
Many Thanks!
July 23, 2013 at 10:22 am
The parsing procedure I generally use (Parse XML to Table[/url]) does NOT preserve the spaces, so I made a slightly modified version that gives you the choice. I really don't know if that is normal XML behavior, but this should let you get around that issue.
CREATE PROCEDURE [dbo].[ParseXMLtoTableWithCDATA]
@strXML AS XML
,@rootnode NVARCHAR(255)
,@preserveCDATA BIT
AS
BEGIN
/*
EXEC dbo.ParseXMLtoTableWithCDATA
N'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
,'ROOT'
,1
EXEC dbo.ParseXMLtoTableWithCDATA
N'<ROOT>
<Test><![CDATA[0spaces]]></Test>
<Test><![CDATA[1 space]]></Test>
<Test><![CDATA[2 spaces]]></Test>
<Test><![CDATA[3 spaces]]></Test>
</ROOT>'
,'ROOT'
,0
EXEC dbo.ParseXMLtoTableWithCDATA
'<items>
<item id="0001" type="Donut">
<name><![CDATA[Cake]]></name>
<ppu><![CDATA[0.55]]></ppu>
<batter id="1001"><![CDATA[Regular]]></batter>
<batter id="1002"><![CDATA[Chocolate]]></batter>
<batter id="1003"><![CDATA[Blueberry]]></batter>
<topping id="5001"><![CDATA[None]]></topping>
<topping id="5002"><![CDATA[Glazed Donut]]></topping>
<topping id="5005"><![CDATA[Sugar Cookie]]></topping>
<topping id="5006"><![CDATA[Sprinkles]]></topping>
<topping id="5003"><![CDATA[Chocolate Cup Cake]]></topping>
<topping id="5004"><![CDATA[Maple Syrup]]></topping>
</item>
</items>'
,'items'
,0
EXEC dbo.ParseXMLtoTableWithCDATA
'<items>
<item id="0001" type="Donut">
<name><![CDATA[Cake]]></name>
<ppu><![CDATA[0.55]]></ppu>
<batter id="1001"><![CDATA[Regular]]></batter>
<batter id="1002"><![CDATA[Chocolate]]></batter>
<batter id="1003"><![CDATA[Blueberry]]></batter>
<topping id="5001"><![CDATA[None]]></topping>
<topping id="5002"><![CDATA[Glazed Donut]]></topping>
<topping id="5005"><![CDATA[Sugar Cookie]]></topping>
<topping id="5006"><![CDATA[Sprinkles]]></topping>
<topping id="5003"><![CDATA[Chocolate Cup Cake]]></topping>
<topping id="5004"><![CDATA[Maple Syrup]]></topping>
</item>
</items>'
,'items'
,1
*/
SET NOCOUNT ON
DECLARE
@strText AS NVARCHAR(MAX)
,@idoc INT
,@id INT
,@parentid INT
IF OBJECT_ID('tempdb..#ChildList') IS NOT NULL
DROP TABLE #ChildList
CREATE TABLE #ChildList (
[RowNum] INT IDENTITY(1,1) NOT NULL,
[parentid] INT NULL,
[id] INT NULL,
PRIMARY KEY (RowNum))
IF OBJECT_ID('tempdb..#NodeList') IS NOT NULL
DROP TABLE #NodeList
CREATE TABLE #NodeList (
[RowNum] INT NOT NULL,
[id] INT NULL,
[parentid] INT NULL,
[nodetype] INT NULL,
[localname] NVARCHAR(MAX) NULL,
[text] NVARCHAR(MAX) NULL,
PRIMARY KEY (RowNum))
SET @id = 1
SET @parentid = NULL
IF @preserveCDATA <> 0
SET @preserveCDATA = 1
/* Get rid of tabs and extra spaces */
SET @strText = CAST(@strXML AS NVARCHAR(MAX))
IF @preserveCDATA = 0
BEGIN
SET @strText =
REPLACE(
REPLACE(
REPLACE(
REPLACE(
@strText
,' ',' '+CHAR(7))
,CHAR(7)+' ','')
,CHAR(7),'')
,CHAR(9),' ')
END
SET @strXML = CONVERT(XML,@strText)
/* Validate the XML */
EXEC sp_xml_preparedocument @idoc OUTPUT, @strXML
/* Parse the XML data */
;WITH cteChildren (parentid, id)
AS (
SELECT
CAST(p1.parentid AS INT) AS parentid
,CAST(p1.id AS INT) AS id
FROM
OPENXML (@idoc,@rootnode,2) AS p1
UNION ALL
SELECT
CAST(p2.parentid AS INT) AS parentid
,CAST(p2.id AS INT) AS id
FROM
OPENXML (@idoc,@rootnode,2) AS p2
INNER JOIN
cteChildren AS cte
ON cte.id = p2.ParentID
WHERE
p2.parentid = @parentid
)
INSERT INTO #ChildList
SELECT
parentid
,id
FROM cteChildren
INSERT INTO #NodeList
SELECT
#ChildList.RowNum
,xmllist.id
,xmllist.parentid
,xmllist.nodetype
,xmllist.localname
,CAST(xmllist.[text] AS NVARCHAR(MAX)) AS [text]
FROM #ChildList
INNER JOIN
OPENXML (@idoc,@rootnode,2) AS xmllist
ON #ChildList.id = xmllist.id
WHERE
#ChildList.RowNum > 0
/* Display the results */
IF OBJECT_ID('dbo.XML_Nodes') IS NOT NULL
DROP TABLE dbo.XML_Nodes
;WITH RecursiveNodes(RowNum,id,parentid,nodepath,localname,[text],nodetype)
AS (
SELECT
#NodeList.RowNum
,#NodeList.id
,#NodeList.parentid
,CAST('/' + REPLACE(REPLACE(REPLACE(REPLACE(#NodeList.localname,'&',''),'?',''),' ',''),'.','') AS NVARCHAR(255)) AS nodepath
,#NodeList.localname
,CAST(#NodeList.[text] AS NVARCHAR(MAX)) AS [text]
,0 AS nodetype
FROM #ChildList
INNER JOIN
#NodeList
ON #ChildList.id = #NodeList.id
WHERE
#NodeList.parentid IS NULL
AND #ChildList.RowNum > 0
AND #NodeList.RowNum > 0
UNION ALL
SELECT
n.RowNum
,n.id
,n.parentid
,CAST(r.nodepath + '/'+ REPLACE(REPLACE(REPLACE(REPLACE(n.localname,'&',''),'?',''),' ',''),'.','') AS NVARCHAR(255)) AS nodepath
,n.localname
,n.[text]
,n.nodetype
FROM #NodeList AS n
INNER JOIN
RecursiveNodes AS r
ON n.parentid = r.id
WHERE
n.RowNum > 0
AND r.RowNum > 0
AND n.parentid >= 0
)
SELECT
ROW_NUMBER() OVER (ORDER BY Result.RowNum) AS RowNum
,Result.id
,Result.parentid
,Result.nodepath
,Result.nodetype
,Result.nodename
,Result.property
,Result.value
,Result.nodecontents
INTO dbo.XML_Nodes
FROM
(
SELECT
rn.RowNum
,rn.id
,rn.parentid
,rn.nodepath
,(CASE
WHEN rn.nodetype = 0 THEN 'Root'
WHEN rn.nodetype = 1 THEN 'Node'
WHEN rn.nodetype = 2 THEN 'Property'
ELSE 'Data'
END) AS nodetype
,(CASE
WHEN rn.nodetype = 0 THEN rn.localname
WHEN rn.nodetype = 1 THEN rn.localname
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) localname FROM RecursiveNodes WHERE id = rn.parentid)
ELSE NULL
END) AS nodename
,(CASE
WHEN rn.nodetype = 2 THEN rn.localname
ELSE NULL
END) AS property
,(CASE
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.id)
ELSE NULL
END) AS value
,(CASE
WHEN rn.nodetype = 1 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.id)
WHEN rn.nodetype = 2 THEN (SELECT TOP(1) [text] FROM RecursiveNodes WHERE parentid = rn.parentid and [text] is not null)
ELSE NULL
END) AS nodecontents
FROM
RecursiveNodes AS rn
WHERE
rn.localname <> '#text'
) AS Result
WHERE
Result.id >= 0
AND (Result.id = 0
OR property IS NOT NULL
OR value IS NOT NULL
OR nodecontents IS NOT NULL)
SELECT
RowNum
,id
,parentid
,nodepath
,nodetype
,nodename
,property
,value
,nodecontents
FROM
dbo.XML_Nodes
END
Viewing 2 posts - 1 through 1 (of 1 total)
You must be logged in to reply to this topic. Login to reply