Most performant way to join a DATE to an INT

  • Hello and thanks in advance if you can help.

    I guess its somewhat acceptable in dimensional modeling to use the INT data type when creating a date dimension surrogate key. I can't say I agree with this but here we go.

    I need to join tables on DATE and INT columns. What is the most performant way?

    DROP TABLE IF EXISTS #ReportDate; 
    CREATE TABLE #ReportDate (ReportDate DATE NOT NULL);

    DROP TABLE IF EXISTS #ReportDateSK;
    CREATE TABLE #ReportDateSK (ReportDateSK INT NOT NULL);

    --I am not using a tally or calendar table to simplify setup
    INSERT INTO #ReportDate
    SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
    DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
    FROM sys.columns a CROSS JOIN sys.columns b;

    INSERT INTO #ReportDateSK
    SELECT
    CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
    FROM #ReportDate;

    --2 potental ways to join
    SELECT
    *
    FROM #ReportDateSK RDSK
    INNER JOIN #ReportDate RD
    ON RDSK.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),RD.ReportDate,112));

    SELECT
    *
    FROM #ReportDateSK RDSK
    INNER JOIN #ReportDate RD
    ON CONVERT(DATE,CONVERT(VARCHAR(8),RDSK.ReportDateSK,101)) = RD.ReportDate;


    • This topic was modified 1 year, 7 months ago by  Chrissy321. Reason: Removed the primary key. While my date dimension does have this primary key I am usually joining staging, fact or other dimension tables
  • Another option is adding a persisted, calculated column to one of the tables and indexing accordingly.

    The absence of evidence is not evidence of absence.
    Martin Rees

    You can lead a horse to water, but a pencil must be lead.
    Stan Laurel

  • Chrissy321 wrote:

    Hello and thanks in advance if you can help.

    I guess its somewhat acceptable in dimensional modeling to use the INT data type when creating a date dimension surrogate key. I can't say I agree with this but here we go.

    I need to join tables on DATE and INT columns. What is the most performant way?

    DROP TABLE IF EXISTS #ReportDate; 
    CREATE TABLE #ReportDate (ReportDate DATE NOT NULL);

    DROP TABLE IF EXISTS #ReportDateSK;
    CREATE TABLE #ReportDateSK (ReportDateSK INT NOT NULL);

    --I am not using a tally or calendar table to simplify setup
    INSERT INTO #ReportDate
    SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
    DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
    FROM sys.columns a CROSS JOIN sys.columns b;

    INSERT INTO #ReportDateSK
    SELECT
    CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
    FROM #ReportDate;

    --2 potental ways to join
    SELECT
    *
    FROM #ReportDateSK RDSK
    INNER JOIN #ReportDate RD
    ON RDSK.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),RD.ReportDate,112));

    SELECT
    *
    FROM #ReportDateSK RDSK
    INNER JOIN #ReportDate RD
    ON CONVERT(DATE,CONVERT(VARCHAR(8),RDSK.ReportDateSK,101)) = RD.ReportDate;


    if the code above reflects what you are doing at the moment then you need to change it.

    and as you didn't really stated how your tables are setup it's hard to state exactly what you should do.

    but the main thing is that if you indeed have 2 tables, one of them has a date on the field, and the other has only a int, then pick up whichever is the smallest, create a temp table from it and convert the field on that particular table so it MATCHES the datatype of the other table so you don't have conversions happening during the joins and so indexing (if you do have them on the required on the remaining table) works.

  • Computed persistent column idea. Renaming tables to Staging and Fact to better represent the actual use case.

    --Staging table in ETL process
    DROP TABLE IF EXISTS #Staging;
    CREATE TABLE #Staging (ReportDate DATE NOT NULL);

    --Fact table in a dimensional model
    DROP TABLE IF EXISTS #Fact;
    CREATE TABLE #Fact (ReportDateSK INT NOT NULL);

    --I am not using a tally or calendar table to simplify setup
    INSERT INTO #Staging
    SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
    DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
    FROM sys.columns a CROSS JOIN sys.columns b;

    INSERT INTO #Fact
    SELECT
    CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
    FROM #Staging;

    ALTER TABLE #Staging ADD ReportDateSK AS (CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112)));

    SELECT
    *
    FROM #Fact RDSK
    INNER JOIN #Staging RD
    ON RDSK.ReportDateSK = RD.ReportDateSK;

     

  • Alternate way. Use the existing date dimension table as an intermediary which should eliminate all conversions. Sorry to be thinking out loud here....

    --Staging table in ETL process
    DROP TABLE IF EXISTS #Staging;
    CREATE TABLE #Staging (ReportDate DATE NOT NULL);

    --Fact table in a dimensional model
    DROP TABLE IF EXISTS #Fact;
    CREATE TABLE #Fact (ReportDateSK INT NOT NULL);

    --Date dimension table
    DROP TABLE IF EXISTS #DateDimension;
    CREATE TABLE #DateDimension (Date DATE NOT NULL, DateSK INT NOT NULL);

    --I am not using a tally or calendar table to simplify setup
    INSERT INTO #Staging
    SELECT TOP(DATEDIFF(DAY,'01/01/2000','12/31/2020') + 1)
    DATEADD(DAY,ROW_NUMBER() OVER (ORDER BY a.object_id) - 1,CONVERT(DATETIME,'12/31/2020')) AS ReportDate
    FROM sys.columns a CROSS JOIN sys.columns b;

    INSERT INTO #Fact
    SELECT
    CONVERT(INT,CONVERT(NVARCHAR(8),ReportDate,112))
    FROM #Staging;

    --My #DateDimension is prepopulated, essentially a calendar table
    --But I'll just use my test tables as a convenient way to populate
    INSERT INTO #DateDimension
    SELECT
    ReportDate,ReportDateSK
    FROM #Fact F
    INNER JOIN #Staging S
    ON F.ReportDateSK = CONVERT(INT,CONVERT(NVARCHAR(8),S.ReportDate,112));

    --Then use #DateDimension as an intermediate join table eliminating the conversions
    SELECT
    *
    FROM #Fact F
    INNER JOIN #DateDimension DD
    ON F.ReportDateSK = DD.DateSK
    INNER JOIN #Staging S
    ON DD.Date = S.ReportDate

Viewing 5 posts - 1 through 4 (of 4 total)

You must be logged in to reply to this topic. Login to reply