Running total over partition order

  • Hi,

    I need to find the running total for each group order by date.
    So the query is select......., sum(colA) OVER (PARTITION BY ColB, ColC, ColD ORDER BY orderdate) .
    The data looks good except for one scenario. There is no rule that we should have the data for every day and for every criteria.
    So, for April 16th, we don't have data for colB = 'A' and ColC = 'B'.
    For April 18th - running total should be 23 (15+8). But my query is returning 8 because of missing data on April 16th.

  • pjrpjr7 - Sunday, September 30, 2018 5:16 PM

    Hi,

    I need to find the running total for each group order by date.
    So the query is select......., sum(colA) OVER (PARTITION BY ColB, ColC, ColD ORDER BY orderdate) .
    The data looks good except for one scenario. There is no rule that we should have the data for every day and for every criteria.
    So, for April 16th, we don't have data for colB = 'A' and ColC = 'B'.
    For April 18th - running total should be 23 (15+8). But my query is returning 8 because of missing data on April 16th.

    Your query is partition by column is incorrect.


    CREATE TABLE summation
    (
    eff_date date,
    cola int,
    colb varchar(30),
    colc varchar(30)
    )

    INSERT INTO summation VALUES ('4/10/2018',20,'A','B');
    INSERT INTO summation VALUES ('4/10/2018',15,'C','D');
    INSERT INTO summation VALUES ('4/16/2018',5,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',10,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',8,'C','D');

    SELECT EFF_DATE,COLA,ColB, ColC,
    sum(colA) OVER (PARTITION BY ColB, ColC order by eff_Date) as running_sum
    FROM SUMMATION

    Saravanan

  • saravanatn - Monday, October 1, 2018 2:28 AM

    pjrpjr7 - Sunday, September 30, 2018 5:16 PM

    Hi,

    I need to find the running total for each group order by date.
    So the query is select......., sum(colA) OVER (PARTITION BY ColB, ColC, ColD ORDER BY orderdate) .
    The data looks good except for one scenario. There is no rule that we should have the data for every day and for every criteria.
    So, for April 16th, we don't have data for colB = 'A' and ColC = 'B'.
    For April 18th - running total should be 23 (15+8). But my query is returning 8 because of missing data on April 16th.

    Your query is partition by column is incorrect.


    CREATE TABLE summation
    (
    eff_date date,
    cola int,
    colb varchar(30),
    colc varchar(30)
    )

    INSERT INTO summation VALUES ('4/10/2018',20,'A','B');
    INSERT INTO summation VALUES ('4/10/2018',15,'C','D');
    INSERT INTO summation VALUES ('4/16/2018',5,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',10,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',8,'C','D');

    SELECT EFF_DATE,COLA,ColB, ColC,
    sum(colA) OVER (PARTITION BY ColB, ColC order by eff_Date) as running_sum
    FROM SUMMATION

    Strongly suggest that you do not rely on the defaults for the ROW/RANGE window frame specification, or in fact any defaults for the OVER clause.
    😎

    Example with the full window frame specification

    USE TEEST;
    GO
    SET NOCOUNT ON;

    ;WITH SAMPLE_DATA([Date],ColA,ColB,ColC) AS
    (
      SELECT
       CONVERT(DATE,X.[Date],101) AS [Date]
       ,X.ColA
       ,X.ColB
       ,X.ColC
      FROM
      ( 
       VALUES
       ('4/10/2018',20,'A','B')
       ,('4/10/2018',15,'C','D')
       ,('4/16/2018',5,'A','B')
       ,('4/18/2018',10,'A','B')
       ,('4/18/2018',8,'C','D')
      )X([Date],ColA,ColB,ColC)
    )
    SELECT
      SD.[Date]
      ,SD.ColA
      ,SD.ColB
      ,SD.ColC
      ,SUM(SD.ColA) OVER
       (
        PARTITION BY SD.ColB
                    ,SD.ColC
        ORDER BY     SD.[Date] ASC
        ROWS BETWEEN UNBOUNDED PRECEDING
             AND     CURRENT ROW
       ) AS RTOTAL
    FROM SAMPLE_DATA  SD;


  • Eirikur Eiriksson - Monday, October 1, 2018 5:09 AM

    saravanatn - Monday, October 1, 2018 2:28 AM

    pjrpjr7 - Sunday, September 30, 2018 5:16 PM

    Hi,

    I need to find the running total for each group order by date.
    So the query is select......., sum(colA) OVER (PARTITION BY ColB, ColC, ColD ORDER BY orderdate) .
    The data looks good except for one scenario. There is no rule that we should have the data for every day and for every criteria.
    So, for April 16th, we don't have data for colB = 'A' and ColC = 'B'.
    For April 18th - running total should be 23 (15+8). But my query is returning 8 because of missing data on April 16th.

    Your query is partition by column is incorrect.


    CREATE TABLE summation
    (
    eff_date date,
    cola int,
    colb varchar(30),
    colc varchar(30)
    )

    INSERT INTO summation VALUES ('4/10/2018',20,'A','B');
    INSERT INTO summation VALUES ('4/10/2018',15,'C','D');
    INSERT INTO summation VALUES ('4/16/2018',5,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',10,'A','B');
    INSERT INTO summation VALUES ('4/18/2018',8,'C','D');

    SELECT EFF_DATE,COLA,ColB, ColC,
    sum(colA) OVER (PARTITION BY ColB, ColC order by eff_Date) as running_sum
    FROM SUMMATION

    Strongly suggest that you do not rely on the defaults for the ROW/RANGE window frame specification, or in fact any defaults for the OVER clause.
    😎

    Example with the full window frame specification

    USE TEEST;
    GO
    SET NOCOUNT ON;

    ;WITH SAMPLE_DATA([Date],ColA,ColB,ColC) AS
    (
      SELECT
       CONVERT(DATE,X.[Date],101) AS [Date]
       ,X.ColA
       ,X.ColB
       ,X.ColC
      FROM
      ( 
       VALUES
       ('4/10/2018',20,'A','B')
       ,('4/10/2018',15,'C','D')
       ,('4/16/2018',5,'A','B')
       ,('4/18/2018',10,'A','B')
       ,('4/18/2018',8,'C','D')
      )X([Date],ColA,ColB,ColC)
    )
    SELECT
      SD.[Date]
      ,SD.ColA
      ,SD.ColB
      ,SD.ColC
      ,SUM(SD.ColA) OVER
       (
        PARTITION BY SD.ColB
                    ,SD.ColC
        ORDER BY     SD.[Date] ASC
        ROWS BETWEEN UNBOUNDED PRECEDING
             AND     CURRENT ROW
       ) AS RTOTAL
    FROM SAMPLE_DATA  SD;


    Sure Erikur. I think you explained  the same concept to someone before. But OP want it using partition by and stated it was not working. So used it.

    Saravanan

  • saravanatn - Monday, October 1, 2018 6:24 AM

    Sure Erikur. I think you explained  the same concept to someone before. But OP want it using partition by and stated it was not working. So used it.

    The difference in performance is almost tenfold, using ROWS is much faster than RANGE, so I would be careful posting such code without a warning.
    😎

    Simple test harness

    USE TEEST
    GO
    SET NOCOUNT ON;

    --/* -- UNCOMMENT THIS LINE TO SKIP THE DATA SET CREATION.
    DECLARE @SAMPLE_SIZE  INT = 1000000;
    DECLARE @ACCOUNT_COUNT INT =  1000;
    DECLARE @VARIANCE   INT =  1024;
    DECLARE @DAY_RANGE  INT =  700;
    DECLARE @GROUP_COUNT  INT =   64;

    IF OBJECT_ID(N'dbo.TBL_RT_TEST') IS NOT NULL DROP TABLE dbo.TBL_RT_TEST;

    ;WITH T(N) AS (SELECT N FROM (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) AS X(N))
    , NUMS(N) AS (SELECT TOP (@SAMPLE_SIZE) ROW_NUMBER() OVER (ORDER BY @@VERSION) AS N FROM T T1,T T2,T T3,T T4,T T5,T T6,T T7,T T8,T T9)
    SELECT
      ISNULL(NM.N,0)          AS TRAN_ID
     ,ISNULL(NM.N % @ACCOUNT_COUNT,0)     AS ACC_ID
     ,ISNULL(NM.N % @GROUP_COUNT,0)     AS GRP_ID
     ,CHECKSUM(NEWID()) % @VARIANCE     AS AMNT
     ,CONVERT(DATE,DATEADD(DAY,CHECKSUM(NEWID()) % @DAY_RANGE,GETDATE()),0) AS TRAN_DATE
    INTO dbo.TBL_RT_TEST
    FROM NUMS NM;

    ALTER TABLE dbo.TBL_RT_TEST ADD CONSTRAINT PK_DBO_RT_TEST_TRAN_ID PRIMARY KEY CLUSTERED (TRAN_ID ASC);

    CREATE NONCLUSTERED INDEX NCLIDX_DBO_RT_TEST_ACC_ID_GRP_ID_TRAN_DATE_INCL_AMNT_TRAN_ID ON dbo.TBL_RT_TEST
    (
      ACC_ID  ASC
     ,GRP_ID  ASC
     ,TRAN_DATE ASC
    )
    INCLUDE
    (
      AMNT
     ,TRAN_ID
    );

    CREATE NONCLUSTERED INDEX NCLIDX_DBO_RT_TEST_ACC_ID_TRAN_DATE_INCL_AMNT_TRAN_ID ON dbo.TBL_RT_TEST
    (
      ACC_ID  ASC
     ,TRAN_DATE ASC
    )
    INCLUDE
    (
      AMNT
     ,TRAN_ID
    );
    -- */
    DECLARE @BIGINT_BUCKET BIGINT = 0;
    DECLARE @DATE_BUCKET  DATE  = '19000101';
    DECLARE @INT_BUCKET  INT  = 0;
    DECLARE @timer TABLE (T_TXT VARCHAR(50) NOT NULL, T_DT DATETIME2(7) NOT NULL DEFAULT(SYSDATETIME()));

    INSERT INTO @timer(T_TXT) VALUES('DRY RUN')
    SELECT
      @BIGINT_BUCKET = RT.ACC_ID
     ,@BIGINT_BUCKET = RT.GRP_ID
     ,@BIGINT_BUCKET = RT.TRAN_ID
     ,@DATE_BUCKET = RT.TRAN_DATE
     ,@INT_BUCKET  = RT.AMNT
    FROM dbo.TBL_RT_TEST RT;
    INSERT INTO @timer(T_TXT) VALUES('DRY RUN')

    INSERT INTO @timer(T_TXT) VALUES('ROWS')
    SELECT
      @BIGINT_BUCKET = RT.ACC_ID
     ,@BIGINT_BUCKET = RT.GRP_ID
     ,@BIGINT_BUCKET = RT.TRAN_ID
     ,@DATE_BUCKET = RT.TRAN_DATE
     ,@INT_BUCKET  = RT.AMNT
     ,@INT_BUCKET  = SUM(RT.AMNT) OVER
       (
        PARTITION BY RT.ACC_ID
            ,RT.GRP_ID
        ORDER BY  RT.TRAN_DATE
        ROWS BETWEEN UNBOUNDED PRECEDING
          AND  CURRENT ROW
       )
    FROM dbo.TBL_RT_TEST RT;
    INSERT INTO @timer(T_TXT) VALUES('ROWS')

    INSERT INTO @timer(T_TXT) VALUES('RANGE')
    SELECT
      @BIGINT_BUCKET = RT.ACC_ID
     ,@BIGINT_BUCKET = RT.GRP_ID
     ,@BIGINT_BUCKET = RT.TRAN_ID
     ,@DATE_BUCKET = RT.TRAN_DATE
     ,@INT_BUCKET  = RT.AMNT
     ,@INT_BUCKET  = SUM(RT.AMNT) OVER
       (
        PARTITION BY RT.ACC_ID
            ,RT.GRP_ID
        ORDER BY  RT.TRAN_DATE
        RANGE BETWEEN UNBOUNDED PRECEDING
          AND  CURRENT ROW
       )
    FROM dbo.TBL_RT_TEST RT;
    INSERT INTO @timer(T_TXT) VALUES('RANGE')

    INSERT INTO @timer(T_TXT) VALUES('NONE')
    SELECT
      @BIGINT_BUCKET = RT.ACC_ID
     ,@BIGINT_BUCKET = RT.GRP_ID
     ,@BIGINT_BUCKET = RT.TRAN_ID
     ,@DATE_BUCKET = RT.TRAN_DATE
     ,@INT_BUCKET  = RT.AMNT
     ,@INT_BUCKET  = SUM(RT.AMNT) OVER
       (
        PARTITION BY RT.ACC_ID
            ,RT.GRP_ID
        ORDER BY  RT.TRAN_DATE
       )
    FROM dbo.TBL_RT_TEST RT;
    INSERT INTO @timer(T_TXT) VALUES('NONE')

    SELECT
      T.T_TXT
     ,DATEDIFF(MICROSECOND,MIN(T.T_DT),MAX(T.T_DT)) AS DURATION
    FROM @timer T
    GROUP BY T.T_TXT
    ORDER BY DURATION ASC;

    Results

    T_TXT       DURATION
    -------- -----------
    DRY RUN      262188
    ROWS        1838282
    NONE       13262407
    RANGE      13365477

Viewing 5 posts - 1 through 4 (of 4 total)

You must be logged in to reply to this topic. Login to reply