OK, all done. I had to include the RAND in the wrapper view too, for the same reason.
Here is the full script:
USE tempdb;
-- Drop the in-line function
IF OBJECT_ID(N'dbo.GetSourceSample', N'IF')
IS NOT NULL
DROP FUNCTION dbo.GetSourceSample
GO
-- Drop the view wrapping NEWID
IF OBJECT_ID(N'dbo.Random', N'V')
IS NOT NULL
DROP VIEW dbo.Random;
GO
-- Drop the combinations table
IF OBJECT_ID(N'dbo.Combinations', N'U')
IS NOT NULL
DROP TABLE dbo.Combinations;
GO
-- Drop our test table
IF OBJECT_ID(N'dbo.Source', N'U')
IS NOT NULL
DROP TABLE dbo.Source;
GO
-- Create numbers table if required
IF OBJECT_ID(N'dbo.Number', N'U')
IS NULL
BEGIN
CREATE TABLE dbo.Number (n SMALLINT PRIMARY KEY);
-- 736 SMALLINTs exactly fill one 8K page
INSERT dbo.Number (n)
SELECT TOP (736)
ROW_NUMBER() OVER (ORDER BY (SELECT 0))
FROM master.sys.columns C1,
master.sys.columns C2,
master.sys.columns C3;
END;
GO
-- Create the test table
CREATE TABLE dbo.Source
(
row_id INTEGER IDENTITY PRIMARY KEY,
vehicle_type CHAR(1) NOT NULL,
vehicle_id INTEGER NOT NULL
);
GO
-- View wrapping NEWID
CREATE VIEW dbo.Random
WITH SCHEMABINDING
AS SELECT n = RAND(CHECKSUM(NEWID()));
GO
-- All possible combinations of values [3...19]
-- that sum to 25
SELECT row_id = IDENTITY(INT, 0, 1),
nc = C.n,
nt = T.n,
nr = R.n
INTO dbo.Combinations
FROM dbo.Number C,
dbo.Number T,
dbo.Number R
WHERE C.n BETWEEN 3 AND 19
AND T.n BETWEEN 3 AND 19
AND R.n BETWEEN 3 AND 19
AND C.n + T.n + R.n = 25
-- Create clustered primary key
ALTER TABLE dbo.Combinations
ADD PRIMARY KEY (row_id)
WITH (FILLFACTOR = 100);
GO
-- Add 100,000 random rows, roughly evenly distributed
-- across vehicle types
INSERT dbo.Source
(
vehicle_type,
vehicle_id
)
SELECT TOP (100000)
CASE
-- Chance of being a car
WHEN RAND(CHECKSUM(NEWID(), N1.n)) <= 0.3333 THEN 'C'
-- Chance of being a truck
WHEN RAND(CHECKSUM(NEWID(), N2.n)) <= 0.3333 THEN 'T'
-- Chance of being an RV
ELSE 'R'
END,
-- Random vehicle id
ABS(CHECKSUM(NEWID()))
FROM dbo.Number N1,
dbo.Number N2,
dbo.Number N3;
GO
-- A helpful index
CREATE UNIQUE INDEX [UQ dbo.Source vehicle_type, row_id (vehicle_id)]
ON dbo.Source
(vehicle_type ASC, row_id ASC)
INCLUDE (vehicle_id)
WITH (FILLFACTOR = 100);
GO
-- Show the number of vehicles of each type
SELECT vehicle_type,
row_count = COUNT_BIG(*)
FROM dbo.Source
GROUP BY
vehicle_type
ORDER BY
vehicle_type;
GO
-- Create the function
CREATE FUNCTION dbo.GetSourceSample ()
RETURNS TABLE
WITH SCHEMABINDING
AS RETURN
WITH SelectionCounts
AS (
-- Number of rows to select from each group
-- Minimum 3, total for all groups = 25
-- Choose one of the possible combinations at random
SELECT TOP (1)
nc = C.n,
nt = T.n,
nr = R.n
FROM dbo.Number C,
dbo.Number T,
dbo.Number R
WHERE C.n BETWEEN 3 AND 19
AND T.n BETWEEN 3 AND 19
AND R.n BETWEEN 3 AND 19
AND C.n + T.n + R.n = 25
ORDER BY
(SELECT R.n FROM dbo.Random R)
),
GroupCounts
AS (
--Find the total number of vehicles of each type
SELECT vehicle_type,
group_size = COUNT_BIG(*)
FROM dbo.Source
WHERE vehicle_type = 'C'
GROUP BY
vehicle_type
UNION ALL
SELECT vehicle_type,
group_size = COUNT_BIG(*)
FROM dbo.Source
WHERE vehicle_type = 'T'
GROUP BY
vehicle_type
UNION ALL
SELECT vehicle_type,
group_size = COUNT_BIG(*)
FROM dbo.Source
WHERE vehicle_type = 'R'
GROUP BY
vehicle_type
),
Parameters
AS (
SELECT GC.group_size,
GC.vehicle_type,
sample_size =
CASE GC.vehicle_type
WHEN 'C' THEN SC.nc
WHEN 'T' THEN SC.nt
WHEN 'R' THEN SC.nr
ELSE 0
END
FROM GroupCounts GC,
SelectionCounts SC
),
RandomRows
AS (
-- Generate a random row number for each row required from
-- each group. Maximum rows returned = 25 * group_count
--
-- The TOP (9223372036854775807 (= BIGINT.Max)) is just to
-- force the query plan to calculate the random row number
-- before the final join
SELECT TOP (9223372036854775807)
P.vehicle_type,
Random.rn
FROM Parameters P
CROSS
APPLY (
SELECT TOP (P.sample_size)
rn = CONVERT(BIGINT, R.n * P.group_size + 1)
FROM dbo.Number N,
dbo.Random R
ORDER BY
N.n ASC
) Random
ORDER BY
P.vehicle_type,
Random.rn
),
NumberedSourceRows
AS (
-- Number the source rows, restarting the numbering for each group
SELECT S.row_id,
S.vehicle_type,
S.vehicle_id,
rn = ROW_NUMBER() OVER (PARTITION BY S.vehicle_type ORDER BY S.row_id ASC)
FROM dbo.Source S
)
-- Fetch the numbered rows that match the random row number, per group
SELECT NSR.row_id,
NSR.vehicle_type,
NSR.vehicle_id
FROM RandomRows RR
JOIN NumberedSourceRows NSR
ON NSR.vehicle_type = RR.vehicle_type
AND NSR.rn = RR.rn;
GO
SET STATISTICS TIME ON;
SELECT GSS.row_id,
GSS.vehicle_type,
GSS.vehicle_id
FROM dbo.GetSourceSample() GSS;
SET STATISTICS TIME OFF;
GO
-- Drop the in-line function
IF OBJECT_ID(N'dbo.GetSourceSample', N'IF')
IS NOT NULL
DROP FUNCTION dbo.GetSourceSample
-- Drop the view wrapping NEWID
IF OBJECT_ID(N'dbo.Random', N'V')
IS NOT NULL
DROP VIEW dbo.Random;
-- Drop the combinations table
IF OBJECT_ID(N'dbo.Combinations', N'U')
IS NOT NULL
DROP TABLE dbo.Combinations;
-- Drop our test table
IF OBJECT_ID(N'dbo.Source', N'U')
IS NOT NULL
DROP TABLE dbo.Source;
Plan stayed exactly the same, 94ms average CPU time.