August 6, 2014 at 12:16 am
Hi,
i have 2 tables created in SQL Server 2014 viz., Staging table & yearweek. Year week gives the Yearweek, Yr & Week running numbers starting from year 2011 (created using Tally function). I am joining these 2 tables in below CTE to extract correct Sales week ranges that satisfy some criteria. The Staging table has data imported from multiple huge CSV's. It doesnt contain any primary keys. For every unique SKU (Description column), I am trying to find the different consecutive Sales weeks ranges containing Gaps (blanks or Zeroes) of <3 in them. If there is a gap of >=3, then it splits a range into 2. I have been able to find the correct gaps. The result of the CTE look like this:
SCOUNTRYSCHARDESCRIPTIONWEEKSCNTWKSMINWEEKMAXWEEK
SPAINGLOBAL016100361372201207201208
SPAINGLOBAL01610036134114201141201202
SPAINGLOBAL0161003613927201109201135
SPAINGLOBAL01610038501423201214201236
SPAINGLOBAL01610038504122201141201210
SPAINGLOBAL0241004245244201324201327
SPAINGLOBAL0241004245315201331201335
SPAINGLOBAL0241004245218201302201319
SPAINGLOBAL02410042453714201237201250
SPAINGLOBAL02410042582218201322201339
SPAINGLOBAL0241004258109201310201318
SPAINGLOBAL02410042584117201241201305
Attaching a Sample file containing the shown example :
Staging table structure:
USE [master]
GO
DROP TABLE [dbo].[staging]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[staging](
[Level] [varchar](5) NULL,
[Week] [varchar](9) NULL,
[Category] [varchar](50) NULL,
[Manufacturer] [varchar](50) NULL,
[Brand] [varchar](50) NULL,
[Description] [varchar](100) NULL,
[EAN] [varchar](100) NULL,
[Sales Value with Innovation] [float] NULL,
[Sales Units with Innovation] [float] NULL,
[Price Per Item] [float] NULL,
[Importance Value w Innovation] [float] NULL,
[Importance Units w Innovation] [float] NULL,
[Numeric Distribution] [float] NULL,
[Weighted Distribution] [float] NULL,
[Average Number of Item] [float] NULL,
[Value] [float] NULL,
[Volume] [float] NULL,
[Units] [float] NULL,
[Sales Value New Manufacturer] [float] NULL,
[Sales Value New Brand] [float] NULL,
[Sales Value New Line Extension] [float] NULL,
[Sales Value New Packaging] [float] NULL,
[Sales Value New Size] [float] NULL,
[Sales Value New Product Form] [float] NULL,
[Sales Value New Style Type] [float] NULL,
[Sales Value New Flavour Fragr] [float] NULL,
[Sales Value New Claim] [float] NULL,
[Sales Units New Manufacturer] [float] NULL,
[Sales Units New Brand] [float] NULL,
[Sales Units New Line Extension] [float] NULL,
[Sales Units New Packaging] [float] NULL,
[Sales Units New Size] [float] NULL,
[Sales Units New Product Form] [float] NULL,
[Sales Units New Style Type] [float] NULL,
[Sales Units New Flavour Fragr] [float] NULL,
[Sales Units New Claim] [float] NULL,
[filename] [nvarchar](260) NULL,
[importdate] [datetime] NULL CONSTRAINT [DF_staging_importdate] DEFAULT (getdate()),
[sCountry] [varchar](50) NULL,
[sChar] [varchar](50) NULL,
[yr] [int] NULL,
[wk] [int] NULL,
[wkno] [int] NULL
) ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
YearWeek table structure:
USE [master]
GO
DROP TABLE [dbo].[yearweek]
GO
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
SET ANSI_PADDING ON
GO
CREATE TABLE [dbo].[yearweek](
[yrwk] [varchar](6) NULL,
[yr] [int] NULL,
[wk] [int] NULL,
[RN] [bigint] NULL
) ON [PRIMARY]
GO
SET ANSI_PADDING OFF
GO
The following CTE gives the proper Sales week ranges :
USE MASTER
GO
WITH Salesrows AS
(
SELECT
[SCOUNTRY],
[SCHAR],
[CATEGORY],
[MANUFACTURER],
[BRAND],
[DESCRIPTION],
[EAN],
[SALES VALUE WITH INNOVATION]=IIF([SALES VALUE WITH INNOVATION] IS NULL,0,[SALES VALUE WITH INNOVATION]),
CONVERT(INT, SUBSTRING([WEEK], 8, 2)) Wk,
CONVERT(INT, SUBSTRING([WEEK], 3, 4)) Yr,
[wkno],
ROW_NUMBER() OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY [WEEK]) RN
FROM STAGING
WHERE ([Level] = 'Item')
)
,SalesRanges as
(
SELECT *,
LAG([SALES VALUE WITH INNOVATION], 1) OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY RN) L1,
LAG([SALES VALUE WITH INNOVATION], 2) OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY RN) L2,
LEAD([SALES VALUE WITH INNOVATION], 1) OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY RN) L5,
LEAD([SALES VALUE WITH INNOVATION], 2) OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY RN) L6
FROM SalesRows
),
Clearcontents as
(
SELECT *,
(CASE WHEN ISNULL([SALES VALUE WITH INNOVATION], 0) = 0 AND ISNULL(L1,0) = 0 AND ISNULL(L2,0) = 0 THEN 1 ELSE 0 END) RemoveMe0,
(CASE WHEN ISNULL([SALES VALUE WITH INNOVATION], 0) = 0 AND ISNULL(L5,0) = 0 AND ISNULL(L6,0) = 0 THEN 1 ELSE 0 END) RemoveMe1,
(CASE WHEN ISNULL([SALES VALUE WITH INNOVATION], 0) = 0 AND ISNULL(L1,0) = 0 AND L2<>0 AND ISNULL(L5,0) = 0 AND L6<>0 THEN 1 ELSE 0 END) RemoveMe2
FROM SalesRanges
),
CleanedData AS
(
SELECT *,
ROW_NUMBER() OVER (PARTITION BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN] ORDER BY yr, RN) NewRn
FROM ClearContents
WHERE RemoveMe0 != 1 and RemoveMe1 != 1 and RemoveMe2 != 1
),
WeekGaps as
(
SELECT *,
(NewRn - Rn) Ref
FROM CleanedData
),
CorrectWeekPeriods as
(
SELECT
[SCOUNTRY],
[SCHAR],
[CATEGORY],
[MANUFACTURER],
[BRAND],
[DESCRIPTION],
[EAN],
COUNT([wkno]) AS CNTWKS,
MIN([wkno]) AS MINWEEK,
MAX([wkno]) AS MAXWEEK,
REF
FROM WeekGaps
GROUP BY [SCOUNTRY],[SCHAR],[CATEGORY],[MANUFACTURER],[BRAND],[DESCRIPTION],[EAN],[REF]
)
SELECT
C.[SCOUNTRY],
C.[SCHAR],
C.[CATEGORY],
C.[MANUFACTURER],
C.[BRAND],
C.[DESCRIPTION],
C.[EAN],
CONVERT(INT, SUBSTRING(yw1.yrwk ,5,2)) WEEKS,
C.CNTWKS,
yw1.yrwk AS MINWEEK,
yw2.yrwk AS MAXWEEK
FROM CorrectWeekPeriods AS C
INNER JOIN yearweek AS yw1 ON C.MINWEEK = yw1.rn
INNER JOIN yearweek AS yw2 ON C.MAXWEEK = yw2.rn
WHERE (C.CNTWKS > 13) AND (C.CNTWKS <= 52)
AND (C.CNTWKS=(SELECT MAX(A.CNTWKS) FROM CorrectWeekPeriods A WHERE C.[SCOUNTRY]=A.[SCOUNTRY] AND C.[SCHAR]=A.[SCHAR] AND C.[DESCRIPTION]=A.[DESCRIPTION]))
AND SUBSTRING(CAST(yw1.yrwk AS VARCHAR(6)),5,2) >= 1
ORDER BY [EAN],[DESCRIPTION]
1. What fields of CTE do i need to join together to Staging table fields to only have these selective periods rows show in table?
2. I am sure this query can be optimized and made more concise. But how?
3. Also if i **comment** the last **WHERE** clause from the **CorrectWeekPeriods** above, and run the query multiple times, i get **different row counts**. I checked the Execution plan and donot get any errors.
If i just **uncomment** the WHERE clause:
WHERE (C.CNTWKS > 13) AND (C.CNTWKS <= 52)
AND (C.CNTWKS=(SELECT MAX(A.CNTWKS) FROM CorrectWeekPeriods A WHERE C.[SCOUNTRY]=A.
or this one:
WHERE C.Description='0241004245'
i get the proper min & max sales week ranges.
1. Also, if i **uncomment** :
WHERE C.Description='0241004245'
i get the error showing in execution plan:
/*
Missing Index Details from SQL_Correct Gaps.sql - ABC.master (ALPHA\SIFAR (52))
The Query Processor estimates that implementing the following index could improve the query cost by 97.7228%.
*/
/*
USE [master]
GO
CREATE NONCLUSTERED INDEX [<Name of Missing Index, sysname,>]
ON [dbo].[staging] ([Level],[Description])
INCLUDE ([Week],[Sales Value with Innovation],[sCountry],[sChar],[wkno])
GO
*/
But if i keep this last WHERE clause **commented**, i don't get this error. BTW i have already created the above index, so don't know why it is asking me to create same index again. Any reason why this happens?
Also, the last few commented code is the RULES i was trying to create but not able to write the proper code. Here is the rule:
1. if there are 2 or more SKU sales week ranges, then pick up the max one (& better if it starts from Week 1 of 2011).
2. exclude any ranges which are >52, to bring them to <=52.
3. if all SKU sales week ranges are >13 & <=52, then keep only the max one (& better if it starts from week 1 of 2011).
4. exclude any ranges <=13.
Hope somebody can guide me in the right direction (especially my main point 1 to join back to Staging table to extract the appropriate SKU sales week ranges).
Edit...
I just uncommented any of the last WHERE clause again :
WHERE (C.CNTWKS > 13) AND (C.CNTWKS <= 52)
AND (C.CNTWKS=(SELECT MAX(A.CNTWKS) FROM CorrectWeekPeriods A WHERE C.[SCOUNTRY]=A.[SCOUNTRY] AND C.[SCHAR]=A.[SCHAR] AND C.[DESCRIPTION]=A.[DESCRIPTION]))
AND SUBSTRING(CAST(yw1.yrwk AS VARCHAR(6)),5,2) >= 1
and looked at the execution plan. it shows Warnings on SORT & HASH. the warning message is:
Operator used tempdb to spill data during execution with spill level 1
and everytime that i execute the query, i get different count of rows. The query also takes ~1 min to execute. I think its somehow related to the Joins to the **yearweek** table, but dont know how to resolve this issue.
any help would be most appreciated.
August 17, 2014 at 6:28 am
There is definitely some room for improvement here but first can you provide the yearweek population code?
😎
August 17, 2014 at 8:02 am
Eirikur Eiriksson (8/17/2014)
There is definitely some room for improvement here but first can you provide the yearweek population code?😎
Sure. (Courtesy: J Livingston)
/* build (only Once) ============================================================================================================= */
IF EXISTS (
SELECT * FROM sysobjects WHERE id = object_id(N'[dbo].[fnTally]') AND xtype IN (N'FN', N'IF', N'TF')
)
DROP FUNCTION [dbo].[fnTally]
GO
CREATE FUNCTION [dbo].[fnTally]
/**********************************************************************************************************************
Purpose:
Return a column of BIGINTs from @ZeroOrOne up to and including @MaxN with a max value of 1 Billion.
As a performance note, it takes about 00:02:10 (hh:mm:ss) to generate 1 Billion numbers to a throw-away variable.
Usage:
--===== Syntax example (Returns BIGINT)
SELECT t.N
FROM dbo.fnTally(@ZeroOrOne, @MaxN) t
;
Notes:
1. Based on Itzik Ben-Gan's cascading CTE (cCTE) method for creating a "readless" Tally Table source of BIGINTs.
Refer to the following URL for how it works. http://sqlmag.com/sql-server/virtual-auxiliary-table-numbers
2. To start a sequence at 0, @ZeroOrOne must be 0 or NULL. Any other value that's convertable to the BIT data-type
will cause the sequence to start at 1.
3. If @ZeroOrOne = 1 and @MaxN = 0, no rows will be returned.
5. If @MaxN is negative or NULL, a "TOP" error will be returned.
6. @MaxN must be a positive number from >= the value of @ZeroOrOne up to and including 1 Billion. If a larger
number is used, the function will silently truncate after 1 Billion. If you actually need a sequence with
that many values, you should consider using a different tool.
7. There will be a substantial reduction in performance if "N" is sorted in descending order. If a descending
sort is required, use code similar to the following. Performance will decrease by about 27% but it's still
very fast especially compared with just doing a simple descending sort on "N", which is about 20 times slower.
If @ZeroOrOne is a 0, in this case, remove the "+1" from the code.
DECLARE @MaxN BIGINT;
SELECT @MaxN = 1000;
SELECT DescendingN = @MaxN - N + 1
FROM dbo.fnTally(1,@MaxN);
8. There is no performance penalty for sorting "N" in ascending order because the output is explicity sorted by
ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
Revision History:
Rev 00 - Unknown - Jeff Moden
- Initial creation with error handling for @MaxN.
Rev 01 - 09 Feb 2013 - Jeff Moden
- Modified to start at 0 or 1.
Rev 02 - 16 May 2013 - Jeff Moden
- Removed error handling for @MaxN because of exceptional cases.
**********************************************************************************************************************/
(@ZeroOrOne BIT, @MaxN BIGINT)
RETURNS TABLE WITH SCHEMABINDING AS
RETURN WITH
E1(N) AS (SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1) --10E1 or 10 rows
, E3(N) AS (SELECT 1 FROM E1 a, E1 b, E1 c) --10E3 or 1 Thousand rows
, E9(N) AS (SELECT 1 FROM E3 a, E3 b, E3 c) --10E9 or 1 Billion rows
SELECT N = 0 WHERE ISNULL(@ZeroOrOne,0)= 0 --Conditionally start at 0.
UNION ALL
SELECT TOP(@MaxN) N = ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) FROM E9 -- Values from 1 to @MaxN
;
GO
/*build (only Once) a table of years/weeks by using above Tally function with a sequential number to aid calculations later on*/
USE Master
GO
IF OBJECT_ID('Master..yearweek', 'U') IS NOT NULL DROP TABLE Master..yearweek ;
DECLARE @startyr as INT
DECLARE @no_yrs as INT
SET @startyr = 2010
SET @no_yrs = 20
SELECT
CAST(a.yearno + b.dayno as VARCHAR(6)) AS yrwk, CAST(a.yearno AS INT) AS yr, CAST(b.dayno AS INT) AS wk, ROW_NUMBER() OVER (ORDER BY a.yearno, b.dayno) AS rn
INTO yearweek
FROM
(SELECT CAST(@startyr + N AS VARCHAR) AS yearno FROM dbo.fnTally(0, @no_yrs))AS a
CROSS JOIN
(SELECT (CASE WHEN N < 10 THEN '0' + CAST(N AS VARCHAR) ELSE CAST(N AS VARCHAR) END ) AS dayno FROM dbo.fnTally(1, 52))AS b
/* UPDATE QUERY: once you have the files imported from SSIS into staging table some of the other columns can be updated*/
USE Master
GO
UPDATE staging
SET yr = yw.yr
, wk = yw.wk
, wkno = yw.rn
FROM staging
INNER JOIN yearweek AS yw
ON SUBSTRING(staging.Week, 3, 4) + SUBSTRING(staging.Week, 8, 2) = yw.yrwk
UPDATE staging
SET sCountry = PARSENAME(REPLACE(filename, '_', '.'), 3)
, sChar = PARSENAME(REPLACE(filename, '_', '.'), 2);
August 28, 2014 at 10:09 pm
Here is the first part and the goal of this part is to improve the yearweek table which then allows us to do a direct join to it rather than using string functions in the join definition.
😎
USE TESTDB;
GO
IF EXISTS (SELECT OBJECT_ID(N'dbo.yearweek'))
BEGIN
DROP TABLE dbo.yearweek;
END
/*
First suggestion, change the structure of the dbo.yearweek table, adding
a clustered index, a computed column and finally an unique covering
index for the query.
*/
CREATE TABLE dbo.yearweek(
yrwk VARCHAR(6) NOT NULL CONSTRAINT PK_DBO_YEARWEEK_YRWK PRIMARY KEY CLUSTERED
,yr INT NOT NULL
,wk INT NOT NULL
,rn INT NOT NULL
,weekx AS (CONVERT( VARCHAR(9),((('W '+CONVERT([char](4),[yr]))+' ')
+stuff('00',(3)-len(CONVERT([char](2),[wk]))
,len(CONVERT([char](2),[wk]))
,CONVERT([char](2),[wk]))),1)) PERSISTED
) ON [PRIMARY];
GO
/* Unique covering index */
CREATE UNIQUE INDEX UNQIDX_DBO_YEARWEEK_WEEKX ON dbo.yearweek (weekx ASC)
INCLUDE (yr,wk,rn);
GO
/* Populate the table */
DECLARE @startyr as INT
DECLARE @no_yrs as INT
SET @startyr = 2010
SET @no_yrs = 20
;WITH T(N) AS (SELECT N FROM (VALUES (NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL),(NULL)) AS X(N))
,YEAR_NUMS(N) AS (SELECT TOP(@no_yrs) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 AS N FROM T T1,T T2, T T3 ,T T4, T T5, T T6, T T7)
,WEEK_NUMS(N) AS (SELECT TOP(52) ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) -1 AS N FROM T T1,T T2, T T3 ,T T4, T T5, T T6, T T7)
INSERT INTO dbo.yearweek (yrwk,yr,wk,rn)
SELECT
CAST(a.yearno + b.dayno as VARCHAR(6)) AS yrwk
,CAST(a.yearno AS INT) AS yr
,CAST(b.dayno AS INT) AS wk
,CONVERT(INT,ROW_NUMBER() OVER (ORDER BY a.yearno, b.dayno),0) AS rn
FROM
(SELECT CAST(@startyr + N AS VARCHAR) AS yearno FROM YEAR_NUMS)AS a
CROSS JOIN
(SELECT (CASE WHEN N < 10 THEN '0' + CAST(N AS VARCHAR) ELSE CAST(N AS VARCHAR) END ) AS dayno FROM WEEK_NUMS)AS b
/* Check the content */
SELECT
*
FROM dbo.yearweek;
BTW: My apologies for the late and piecemeal like answer:-)
September 6, 2014 at 12:53 pm
Here is another addition to the code, identifying the gaps more efficiently.
😎
USE TESTDB;
GO
;WITH BASE_DATA AS
(
SELECT
ST.STG_ID
,ST.[Description] AS SKU
,YK.rn - RANK() OVER
(
PARTITION BY Description
ORDER BY WEEK
) AS WN_OFFSET
,YK.weekx
,YK.rn
FROM dbo.staging ST
INNER JOIN dbo.yearweek YK
ON ST.[Week] = YK.weekx
)
,GROUP_BOUNDRIES AS
(
SELECT
BD.STG_ID
,BD.SKU
,ROW_NUMBER() OVER
(
PARTITION BY BD.SKU,BD.WN_OFFSET
ORDER BY BD.rn ASC
) AS BD_WO_RID
,BD.WN_OFFSET
,BD.weekx
,BD.rn
FROM BASE_DATA BD
)
,GROUP_EDGES AS
(
SELECT
GB.STG_ID
,ROW_NUMBER() OVER
(
ORDER BY GB.STG_ID
) AS GB_RID
,GB.SKU
,GB.BD_WO_RID
,GB.WN_OFFSET
,GB.weekx
,GB.rn
FROM GROUP_BOUNDRIES GB
-- COMMENT OUT THE WHERE STATEMENT
-- TO RETRIEVE ALL RESULTS, THIS
-- ONLY SHOWS THE EDGES
WHERE GB.BD_WO_RID = 1
)
SELECT
GE1.STG_ID
,GE1.GB_RID
,GE1.SKU
,GE1.WN_OFFSET
,GE2.WN_OFFSET - GE1.WN_OFFSET
FROM GROUP_EDGES GE1
LEFT OUTER JOIN GROUP_EDGES GE2
ON GE1.GB_RID = GE2.GB_RID - 1
AND GE1.SKU = GE2.SKU;
Viewing 5 posts - 1 through 4 (of 4 total)
You must be logged in to reply to this topic. Login to reply