May 22, 2023 at 4:15 pm
Hello and thanks in advance if you can help.
I guess its somewhat acceptable in dimensional modeling to use the INT data type when creating a date dimension surrogate key. I can't say I agree with this but here we go.
I need to join tables on DATE and INT columns. What is the most performant way?
DROP TABLE IF EXISTS #ReportDate;
CREATE TABLE #ReportDate (ReportDate DATE NOT NULL);
DROP TABLE IF EXISTS #ReportDateSK;
CREATE TABLE #ReportDateSK (ReportDateSK INT NOT NULL);
--I am not using a tally or calendar table to simplify setup
INSERT INTO #ReportDate
SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
FROM sys.columns a CROSS JOIN sys.columns b;
INSERT INTO #ReportDateSK
SELECT
CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
FROM #ReportDate;
--2 potental ways to join
SELECT
*
FROM #ReportDateSK RDSK
INNER JOIN #ReportDate RD
ON RDSK.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),RD.ReportDate,112));
SELECT
*
FROM #ReportDateSK RDSK
INNER JOIN #ReportDate RD
ON CONVERT(DATE,CONVERT(VARCHAR(8),RDSK.ReportDateSK,101)) = RD.ReportDate;
May 22, 2023 at 4:27 pm
Another option is adding a persisted, calculated column to one of the tables and indexing accordingly.
The absence of evidence is not evidence of absence.
Martin Rees
You can lead a horse to water, but a pencil must be lead.
Stan Laurel
May 22, 2023 at 5:43 pm
Hello and thanks in advance if you can help.
I guess its somewhat acceptable in dimensional modeling to use the INT data type when creating a date dimension surrogate key. I can't say I agree with this but here we go.
I need to join tables on DATE and INT columns. What is the most performant way?
DROP TABLE IF EXISTS #ReportDate;
CREATE TABLE #ReportDate (ReportDate DATE NOT NULL);
DROP TABLE IF EXISTS #ReportDateSK;
CREATE TABLE #ReportDateSK (ReportDateSK INT NOT NULL);
--I am not using a tally or calendar table to simplify setup
INSERT INTO #ReportDate
SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
FROM sys.columns a CROSS JOIN sys.columns b;
INSERT INTO #ReportDateSK
SELECT
CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
FROM #ReportDate;
--2 potental ways to join
SELECT
*
FROM #ReportDateSK RDSK
INNER JOIN #ReportDate RD
ON RDSK.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),RD.ReportDate,112));
SELECT
*
FROM #ReportDateSK RDSK
INNER JOIN #ReportDate RD
ON CONVERT(DATE,CONVERT(VARCHAR(8),RDSK.ReportDateSK,101)) = RD.ReportDate;
if the code above reflects what you are doing at the moment then you need to change it.
and as you didn't really stated how your tables are setup it's hard to state exactly what you should do.
but the main thing is that if you indeed have 2 tables, one of them has a date on the field, and the other has only a int, then pick up whichever is the smallest, create a temp table from it and convert the field on that particular table so it MATCHES the datatype of the other table so you don't have conversions happening during the joins and so indexing (if you do have them on the required on the remaining table) works.
May 22, 2023 at 6:40 pm
Computed persistent column idea. Renaming tables to Staging and Fact to better represent the actual use case.
--Staging table in ETL process
DROP TABLE IF EXISTS #Staging;
CREATE TABLE #Staging (ReportDate DATE NOT NULL);
--Fact table in a dimensional model
DROP TABLE IF EXISTS #Fact;
CREATE TABLE #Fact (ReportDateSK INT NOT NULL);
--I am not using a tally or calendar table to simplify setup
INSERT INTO #Staging
SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
FROM sys.columns a CROSS JOIN sys.columns b;
INSERT INTO #Fact
SELECT
CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
FROM #Staging;
ALTER TABLE #Staging ADD ReportDateSK AS (CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112)));
SELECT
*
FROM #Fact RDSK
INNER JOIN #Staging RD
ON RDSK.ReportDateSK = RD.ReportDateSK;
May 22, 2023 at 7:00 pm
Alternate way. Use the existing date dimension table as an intermediary which should eliminate all conversions. Sorry to be thinking out loud here....
--Staging table in ETL process
DROP TABLE IF EXISTS #Staging;
CREATE TABLE #Staging (ReportDate DATE NOT NULL);
--Fact table in a dimensional model
DROP TABLE IF EXISTS #Fact;
CREATE TABLE #Fact (ReportDateSK INT NOT NULL);
--Date dimension table
DROP TABLE IF EXISTS #DateDimension;
CREATE TABLE #DateDimension (Date DATE NOT NULL, DateSK INT NOT NULL);
--I am not using a tally or calendar table to simplify setup
INSERT INTO #Staging
SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
FROM sys.columns a CROSS JOIN sys.columns b;
INSERT INTO #Fact
SELECT
CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
FROM #Staging;
--My #DateDimension is prepopulated, essentially a calendar table
--But I'll just use my test tables as a convenient way to populate
INSERT INTO #DateDimension
SELECT
ReportDate,ReportDateSK
FROM #Fact F
INNER JOIN #Staging S
ON F.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),S.ReportDate,112));
--Then use #DateDimension as an intermediate join table eliminating the conversions
SELECT
*
FROM #Fact F
INNER JOIN #DateDimension DD
ON F.ReportDateSK = DD.DateSK
INNER JOIN #Staging S
ON DD.Date = S.ReportDate
Viewing 5 posts - 1 through 4 (of 4 total)
You must be logged in to reply to this topic. Login to reply