Find run of repeating value

Question

Find run of repeating value

Viewing 9 posts - 16 through 23 (of 23 total)

You must be logged in to reply to this topic. Login to reply

kuopaz Default port Points: 1449 More actions · Answer 1

My solution, no claim to originality.

declare @AtLeastCount int = 3

;with T as
(
select *, RowNum = row_number() over(order by ID)
from @T
),
T1 as
(
select *, RowGroup = RowNum - row_number() over (order by RowNum)
from T
where Val = 0
),
T2 as
(
select *, RowGroup = RowNum - row_number() over (order by RowNum)
from T
where Val = 1
),
T3 as
(
select 
Val = min(Val), 
FirstID = min(ID),
LastID = max(ID),
CountID = count(ID)
from T1
group by
RowGroup
union all
select 
Val = min(Val), 
FirstID = min(ID), 
LastID = max(ID),
CountID = count(ID)
from T2
group by
RowGroup
),
T4 as
(
select *, 
BeforeID = lag(LastID) over(order by LastID), 
BeforeVal = lag(Val) over(order by LastID), 
AfterID = lead(FirstID) over(order by FirstID),
AfterVal = lead(Val) over(order by FirstID)
from T3
)

--unpivot
select 
X.PosDec,
X.ID,
X.Val
from 
T4
cross apply
(
values
('First', FirstID, Val),
('Last', LastID, Val),
('Before', BeforeID, BeforeVal),
('After', AfterID, AfterVal) 
) X(PosDec, ID, Val)
where
T4.Val = 1
and T4.CountID >= @AtLeastCount
and X.ID is not null
order by
X.ID

kuopaz Default port Points: 1449 More actions · Answer 2

And without the UNION ALL.

declare @AtLeastCount int = 3

;with T as
(
select *, 
RowNum0 = case when Val = 0 then row_number() over(order by ID) else null end,
RowNum1 = case when Val = 1 then row_number() over(order by ID) else null end
from @T
),
T1 as
(
select *, 
RowGroup0 = case when Val = 0 then RowNum0 - row_number() over (order by RowNum0) else null end,
RowGroup1 = case when Val = 1 then RowNum1 - row_number() over (order by RowNum1) else null end
from T
),
T2 as
(
select 
Val = min(Val), 
FirstID = min(ID),
LastID = max(ID),
CountID = count(ID)
from T1
group by
coalesce(RowGroup0, RowGroup1)
),
T3 as
(
select *, 
BeforeID = lag(LastID) over(order by LastID), 
BeforeVal = lag(Val) over(order by LastID), 
AfterID = lead(FirstID) over(order by FirstID),
AfterVal = lead(Val) over(order by FirstID)
from T2
)

--unpivot
select 
X.PosDec,
X.ID,
X.Val
from 
T3
cross apply
(
values
('First', FirstID, Val),
('Last', LastID, Val),
('Before', BeforeID, BeforeVal),
('After', AfterID, AfterVal) 
) X(PosDec, ID, Val)
where
T3.Val = 1
and T3.CountID >= @AtLeastCount
and X.ID is not null
order by
X.ID

kuopaz Default port Points: 1449 More actions · Answer 3

In my previous attempts, I don't think I can guarantee that RowGroup is unique across the two 'paths', and so could get incorrect result when combined. Would depend on the test data? So my next version...

declare @AtLeastCount int = 3

;with T as
(
select *, 
RowNum = row_number() over(order by ID),
BeforeID = lag(ID) over(order by ID), --to use later
BeforeVal = lag(Val) over(order by ID),
AfterID = lead(ID) over(order by ID),
AfterVal = lead(Val) over(order by ID)
from @T
),
T1 as
(
select *,
RowGroup = RowNum - row_number() over (order by RowNum) --create groups for islands
from T
where 
Val = 1 --important is here
),
T2 as
(
select
Val = min(Val), 
FirstID = min(ID),
LastID = max(ID),
CountID = count(ID)
from T1
group by 
RowGroup
),
T3 as
(
select 
T2.Val, 
T2.FirstID,
T2.LastID, 
[Before].BeforeID, 
[Before].BeforeVal, 
[After].AfterID, 
[After].AfterVal
from 
T2
inner join T as [Before] on --look back
T2.FirstID = [Before].ID
inner join T as [After] on
T2.LastID = [After].ID
where
CountID >= @AtLeastCount
)

--unpivot
select 
X.PosDec,
X.ID,
X.Val
from 
T3
cross apply
(
values
('First', FirstID, Val),
('Last', LastID, Val),
('Before', BeforeID, BeforeVal),
('After', AfterID, AfterVal) 
) X(PosDec, ID, Val)
where
X.ID is not null
order by
X.ID

vliet SSCommitted Points: 1998 More actions · Answer 4

Hello kuopaz, the following elegant solution uses only windowed sums to obtain the desired result:

/* INPUT TEST DATA (from original post) */
declare @T table(ID int not null primary key, Val int not null)

insert @T(ID, Val)
select 1, 0 union all
select 10, 1 union all
select 17, 0 union all
select 18, 0 union all
select 27, 1 union all
select 34, 1 union all
select 42, 1 union all
select 52, 0 union all
select 62, 1 union all
select 63, 1 union all
select 65, 0 union all
select 73, 0 union all
select 79, 1 union all
select 85, 1 union all
select 89, 1 union all
select 90, 1

/* Original test data with a proper ORDER BY clause */
select * from
  @T
order by
  ID;

/*
-- An answer to the original question finding runs of at least 3 subsequent 1s
-- Each original row appears at most once in the result set
--
--      Remark: it is possible that a row with [Val] = 0 is both 'Before'
--              and 'After' a run, hence the addition of 'Between'
--    Required: sums of the preceding and following 1, N (3) and N - 1 (2) values
--  Assumption: [Val] is always either 0 or 1
-- Alternative: if [Val] may have other values as well, use
--              sum(case when Val = 1 then 1 else 0 end) instead of
--              sum(Val) in each windowed aggregate
*/
select * from
( select
    case
      when Val = 0 and Preceding3 < 3 and Following3 = 3 then 'Before'
      when Val = 0 and Preceding3 = 3 and Following3 < 3 then 'After'
      when Val = 0 and Preceding3 = 3 and Following3 = 3 then 'Between'
      when Val = 1 and Preceding1 = 1 and Following2 = 3 then 'First'
      when Val = 1 and Preceding2 = 3 and Following1 = 1 then 'Last'
    end as PosDesc
  , ID
  , Val
  from
  ( select
      ID
    , Val
    , sum(Val) over (order by ID rows between current row and 1 following) as Following1
    , sum(Val) over (order by ID rows between current row and 2 following) as Following2
    , sum(Val) over (order by ID rows between current row and 3 following) as Following3
    , sum(Val) over (order by ID rows between 1 preceding and current row) as Preceding1
    , sum(Val) over (order by ID rows between 2 preceding and current row) as Preceding2
    , sum(Val) over (order by ID rows between 3 preceding and current row) as Preceding3
    from
      @T
  ) as W
) as R
where
  PosDesc is not null
order by
  ID;

/*
-- An answer to an alternative question finding runs of at least 4 subsequent 1s
-- Each original row appears at most once in the result set
--
--      Remark: it is possible that a row with [Val] = 0 is both 'Before'
--              and 'After' a run, hence the addition of 'Between'
--    Required: sums of the preceding and following 1, N (4) and N - 1 (3) values
--  Assumption: [Val] is always either 0 or 1
-- Alternative: if [Val] may have other values as well, use
--              sum(case when Val = 1 then 1 else 0 end) instead of
--              sum(Val) in each windowed aggregate
*/
select * from
( select
    case
      when Val = 0 and Preceding4 < 4 and Following4 = 4 then 'Before'
      when Val = 0 and Preceding4 = 4 and Following4 < 4 then 'After'
      when Val = 0 and Preceding4 = 4 and Following4 = 4 then 'Between'
      when Val = 1 and Preceding1 = 1 and Following3 = 4 then 'First'
      when Val = 1 and Preceding3 = 4 and Following1 = 1 then 'Last'
    end as PosDesc
  , ID
  , Val
  from
  ( select
      ID
    , Val
    , sum(Val) over (order by ID rows between current row and 1 following) as Following1
    , sum(Val) over (order by ID rows between current row and 3 following) as Following3
    , sum(Val) over (order by ID rows between current row and 4 following) as Following4
    , sum(Val) over (order by ID rows between 1 preceding and current row) as Preceding1
    , sum(Val) over (order by ID rows between 3 preceding and current row) as Preceding3
    , sum(Val) over (order by ID rows between 4 preceding and current row) as Preceding4
    from
      @T
  ) as W
) as R
where
  PosDesc is not null
order by
  ID;

It can not use the run length as a parameter, because the window specification requires a literal value.

But for every given run length a solution exists using only six windowed aggregates, as shown above.

This reply was modified 4 years, 2 months ago by vliet.
This reply was modified 4 years, 2 months ago by vliet.
This reply was modified 4 years, 2 months ago by vliet.
This reply was modified 4 years, 2 months ago by vliet.
This reply was modified 4 years, 2 months ago by vliet. Reason: Formatting code for better readability
This reply was modified 4 years, 2 months ago by vliet. Reason: A small error in the last comment section
This reply was modified 4 years, 2 months ago by vliet. Reason: Using lower case keywords to improve coloring
This reply was modified 4 years, 2 months ago by vliet. Reason: Another small error correction and using lower case in the SQL parts of the last comment section
This reply was modified 4 years, 2 months ago by vliet.

ScottPletcher SSC Guru Points: 101127 More actions · Answer 5

vliet wrote:

It can not use the run length as a parameter, because the window specification requires a literal value.
But for every given run length a solution exists using only six windowed aggregates, as shown above.

So, how do you identity sequences of, say, 8 1s rather than 3/4, with "only six windowed aggregates"?

SQL DBA,SQL Server MVP(07, 08, 09) "It's a dog-eat-dog world, and I'm wearing Milk-Bone underwear." "Norm", on "Cheers". Also from "Cheers", from "Carla": "You need to know 3 things about Tortelli men: Tortelli men draw women like flies; Tortelli men treat women like flies; Tortelli men's brains are in their flies".

vliet SSCommitted Points: 1998 More actions · Answer 6

To identify runs of eight 1s you can use the following select statement:

select * from
( select
    case
      when Val = 0 and Preceding8 < 8 and Following8 = 8 then 'Before'
      when Val = 0 and Preceding8 = 8 and Following8 < 8 then 'After'
      when Val = 0 and Preceding8 = 8 and Following8 = 8 then 'Between'
      when Val = 1 and Preceding1 = 1 and Following7 = 8 then 'First'
      when Val = 1 and Preceding7 = 8 and Following1 = 1 then 'Last'
    end as PosDesc
  , ID
  , Val
  from
  ( select
      ID
    , Val
    , sum(Val) over (order by ID rows between current row and 1 following) as Following1
    , sum(Val) over (order by ID rows between current row and 7 following) as Following7
    , sum(Val) over (order by ID rows between current row and 8 following) as Following8
    , sum(Val) over (order by ID rows between 1 preceding and current row) as Preceding1
    , sum(Val) over (order by ID rows between 7 preceding and current row) as Preceding7
    , sum(Val) over (order by ID rows between 8 preceding and current row) as Preceding8
    from
      @T
  ) as W
) as R
where
  PosDesc is not null
order by
  ID;

kuopaz Default port Points: 1449 More actions · Answer 7

Thanks vliet. It is elegant but having to use a literal value in the windowing frame is a big limitation.

The islands method above also uses six windowing functions - two lag, two lead, two row_number.

Jacob Wilkins One Orange Chip Points: 27976 More actions · Answer 8

As long as we're fixated on the number of windowed functions (I'd personally prefer choosing based on overall performance profile over a wide variety of data sets, but c'est la vie), I guess I should point out that my first solution uses exactly 2 windowed functions, and can easily take the required number of consecutive 1s as a parameter. 🙂

Cheers!

kuopaz Default port Points: 1449 More actions · Answer 9

kuopaz

Default port

Points: 1449

January 15, 2021 at 10:31 am

#3833716

No, it's well enough defined.