May 12, 2010 at 5:16 pm
I have a table that logs the locations of gps devices based on time. There are 10,000 devices tracked and each device records its location every minute so the table will have millions of rows. Since the table will be receiving data constantly, the clustered index is set on date and the ID of the gps device.
However, there is a business requirement to query the table based on a certain time span and determine the most recent location of all devices within that range. So I also have a non-clustered index ordered on gps device ID and the date.
I've created a sample table below with the code to load 3 million rows.
-- Table with log of gps device pings
IF OBJECT_ID('gps_ping') IS NOT NULL DROP TABLE gps_ping
CREATE TABLE gps_ping (
ping_time DATETIME2 NOT NULL ,
device_id INT NOT NULL,
device_location GEOGRAPHY NOT NULL,
CONSTRAINT PK_GPS_PING PRIMARY KEY (ping_time, device_id)
)
-- Create an index on the table
CREATE INDEX IX_DEVICE ON gps_ping (device_id, ping_time)
DECLARE @g GEOGRAPHY
SET @g = geography::STPointFromText('POINT(-100 30)',4326)
-- Load table with 3,000,000 rows - 10,000 devices which ping every minute for 300 minutes
-- Takes about 1 minute to load
INSERT gps_ping (
ping_time,
device_id,
device_location
)
SELECT
ping_time,
device_id,
@g
FROM (
SELECT DATEADD(mi, rowNum, '1/1/2010') AS ping_time
FROM (
SELECT TOP 300 ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS rowNum
FROM sys.all_objects
) r
) p
CROSS JOIN (
SELECT TOP 10000 ROW_NUMBER() OVER (ORDER BY (SELECT 0)) AS device_id
FROM sys.all_objects s1 CROSS JOIN sys.all_objects s2
) d
When I try to get the most maximum ping_time of every device, SQL Server will not use the non-clustered index but does a clustered index seek instead. Even if the date range covers the entire table.
DBCC FREEPROCCACHE
DBCC DROPCLEANBUFFERS
DECLARE
@startDate DATETIME2,
@endDate DATETIME2
SELECT
@startDate = '1/1/2010 00:00:00',
@endDate = '1/1/2010 05:00:00'
SELECT
device_id,
MAX(ping_time) AS ping_time
FROM gps_ping
WHERE ping_time BETWEEN @startDate AND @endDate
GROUP BY
device_id
Here is the query plan:
<?xml version="1.0" encoding="utf-16"?>
<ShowPlanXML xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" Version="1.1" Build="10.0.2531.0" xmlns="http://schemas.microsoft.com/sqlserver/2004/07/showplan">
<BatchSequence>
<Batch>
<Statements>
<StmtSimple StatementCompId="1" StatementId="1" StatementText="DBCC FREEPROCCACHE " StatementType="DBCC" />
</Statements>
<Statements>
<StmtSimple StatementCompId="2" StatementId="2" StatementText=" DBCC DROPCLEANBUFFERS " StatementType="DBCC" />
</Statements>
<Statements>
<StmtSimple StatementCompId="3" StatementId="3" StatementText=" DECLARE @startDate DATETIME2, @endDate DATETIME2 SELECT @startDate = '1/1/2010 00:00:00', @endDate = '1/1/2010 05:00:00' " StatementType="ASSIGN" />
</Statements>
<Statements>
<StmtSimple StatementCompId="4" StatementEstRows="10000" StatementId="4" StatementOptmLevel="FULL" StatementSubTreeCost="2.88526" StatementText=" SELECT device_id, MAX(ping_time) AS ping_time FROM gps_ping WHERE ping_time BETWEEN @startDate AND @endDate GROUP BY device_id " StatementType="SELECT" QueryHash="0x74CC0F4A9CB53649" QueryPlanHash="0xD810AB20BB08F0E6">
<StatementSetOptions ANSI_NULLS="true" ANSI_PADDING="true" ANSI_WARNINGS="true" ARITHABORT="true" CONCAT_NULL_YIELDS_NULL="true" NUMERIC_ROUNDABORT="false" QUOTED_IDENTIFIER="true" />
<QueryPlan CachedPlanSize="16" CompileTime="3" CompileCPU="3" CompileMemory="176">
<RelOp AvgRowSize="19" EstimateCPU="1.42276" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="10000" LogicalOp="Aggregate" NodeId="0" Parallel="false" PhysicalOp="Hash Match" EstimatedTotalSubtreeCost="2.88526">
<OutputList>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="device_id" />
<ColumnReference Column="Expr1003" />
</OutputList>
<MemoryFractions Input="1" Output="1" />
<Hash>
<DefinedValues>
<DefinedValue>
<ColumnReference Column="Expr1003" />
<ScalarOperator ScalarString="MAX([SPATIAL].[dbo].[gps_ping].[ping_time])">
<Aggregate AggType="MAX" Distinct="false">
<ScalarOperator>
<Identifier>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="ping_time" />
</Identifier>
</ScalarOperator>
</Aggregate>
</ScalarOperator>
</DefinedValue>
</DefinedValues>
<HashKeysBuild>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="device_id" />
</HashKeysBuild>
<RelOp AvgRowSize="19" EstimateCPU="0.297157" EstimateIO="1.16535" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="270000" LogicalOp="Clustered Index Seek" NodeId="1" Parallel="false" PhysicalOp="Clustered Index Seek" EstimatedTotalSubtreeCost="1.4625" TableCardinality="3000000">
<OutputList>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="ping_time" />
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="device_id" />
</OutputList>
<IndexScan Ordered="true" ScanDirection="FORWARD" ForcedIndex="false" ForceSeek="false" NoExpandHint="false">
<DefinedValues>
<DefinedValue>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="ping_time" />
</DefinedValue>
<DefinedValue>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="device_id" />
</DefinedValue>
</DefinedValues>
<Object Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Index="[PK_GPS_PING]" IndexKind="Clustered" />
<SeekPredicates>
<SeekPredicateNew>
<SeekKeys>
<StartRange ScanType="GE">
<RangeColumns>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="ping_time" />
</RangeColumns>
<RangeExpressions>
<ScalarOperator ScalarString="[@startDate]">
<Identifier>
<ColumnReference Column="@startDate" />
</Identifier>
</ScalarOperator>
</RangeExpressions>
</StartRange>
<EndRange ScanType="LE">
<RangeColumns>
<ColumnReference Database="[SPATIAL]" Schema="[dbo]" Table="[gps_ping]" Column="ping_time" />
</RangeColumns>
<RangeExpressions>
<ScalarOperator ScalarString="[@endDate]">
<Identifier>
<ColumnReference Column="@endDate" />
</Identifier>
</ScalarOperator>
</RangeExpressions>
</EndRange>
</SeekKeys>
</SeekPredicateNew>
</SeekPredicates>
</IndexScan>
</RelOp>
</Hash>
</RelOp>
</QueryPlan>
</StmtSimple>
</Statements>
</Batch>
</BatchSequence>
</ShowPlanXML>
I assume the clustered index is being used because the where clause is specifying a range of values on one of the clustered columns. But using the non-clustered index is faster (and has fewer reads according to STATISTICS IO).
I noticed that this only happens once the table reaches a certain threshold in size. At one million rows, I wasn't seeing this issue. I am considering using a forced index but I've read that forcing indexes is not a recommended practice.
For the table in this example, the time for the query to return the data isn't bad. But the real table is much larger with various other columns and the query takes up to 12-14 seconds. When I force the non-clustered index in the real table, the data comes back in 1 second.
So my questions are:
1) Is there anything I can do to get SQL Server to use the non-clustered index that doesn't involve brute force?
2) Is using a forced index acceptable in this case?
3) The query analyzer is showing the forced index query to have a much higher cost (10 versus 2 in the original). Does anyone know why?
I realize this is quite a bit to process but any thoughts or suggestions would be greatly appreciated...
May 13, 2010 at 2:56 am
The clustered index (primary key) is on the ping_time, device_id column. The only where clause predicate you have is on the ping_time column. Hence it makes sense to use the cluster to seek for that value.
The nonclustered index is not seekable. It has the device_id as the leading column, but you have no predicate (filter) on that column, hence there's nothing for SQL to seek on. It cannot seek only on the second column of an index.
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
May 13, 2010 at 10:19 am
I see. And when I modified the query to include a device_id in the where clause then it used the non-clustered index.
Thanks very much for the explanation.
May 13, 2010 at 10:39 am
8kb (5/13/2010)
I see. And when I modified the query to include a device_id in the where clause then it used the non-clustered index.
Yup, because now the index is seekable and is cheaper than the cluster.
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
May 15, 2010 at 11:20 am
8kb (5/12/2010)
When I try to get the most maximum ping_time of every device, SQL Server will not use the non-clustered index but does a clustered index seek instead. Even if the date range covers the entire table.
The problem here is not so much one of whether the index is seekable or not - the main problem is the use of local variables:
When SQL Server compiles the batch, it produces a plan that encompasses all the statements in that batch. This includes the variable declarations, setting the values of the variables, and the final SELECT query.
The important thing to appreciate is that the variables are not set to their particular values until execution time - so at compilation time, the values are unknown. The optimiser resorts to a hard-coded guess at the selectivity of the BETWEEN statement - 9% of the input cardinality in current versions of SQL Server. The table contains three million rows, and 9% of that is the 270,000 rows you see in the query plan.
This is wildly wrong for the values shown in your sample, which cover all three million rows. If the optimiser had that information, it would choose to scan the (narrower) non-clustered index instead of seeking on the clustered index.
For other values, the clustered index seek might be a better choice - though not by as much as you might think. Assuming that the non-clustered index is in reasonably good shape fragmentation-wise, scanning the index and applying the BETWEEN predicate to each row is a pretty fast operation.
The index scan also benefits the GROUP BY: by providing data in the correct order, the optimiser can use a Stream Aggregate operator, rather than the Hash Aggregate used in the clustered index plan. The Hash Aggregate requires a memory grant, which is expensive and may cause the query to wait until memory becomes available.
The memory grant required is calculated at compile time, based in part on the estimated number of rows. On a system experiencing at least moderate memory contention, this might result in the Hash operation spilling to disk. This is just one of several factors that the current optimiser does not consider.
Anyway, enough background, onto the questions.
Is there anything I can do to get SQL Server to use the non-clustered index that doesn't involve brute force?
Yes, give the optimiser better information. There are three main ways to achieve this: (1) set the values in a higher scope; (2) recompile the SELECT statement every time it runs; or (3) use a hint.
A higher scope might mean using sp_executesql, or passing the values in as parameters to a procedure or function, for example. This is not a perfect solution since SQL Server will by default detect the parameter values at compilation time and cache a plan optimised for those specific values. That's great if the values used are typical, and produce a plan that works well for any input. That is probably not the case here.
Recompiling each time involves adding an OPTION (RECOMPILE) query hint to the SELECT statement. This is a often a good choice since the values of the variables or parameters are known when the statement recompiles, so SQL Server will typically produce a high-quality plan. The downside is that the query is recompiled every time - with the expected effects on CPU and memory use. It is probably a good choice for this query.
The third option is to use a OPTION(OPTIMIZE FOR...) hint. This is again problematic since there is no typical values which produce a good plan for all other values. The 2008 OPTIMIZE FOR UNKNOWN option does not help, since this just uses the same 9% guess in this case.
Is using a forced index acceptable in this case?
I would say not, since the non-clustered index is not optimal in all cases. If you do decide that SQL Server should use the non-clustered index in all cases, OPTION (FAST 1) will produce the same effect, without hard-coding the name of an index. My preferred option is still OPTION (RECOMPILE).
The query analyzer is showing the forced index query to have a much higher cost (10 versus 2 in the original). Does anyone know why?
The cost is the estimated cost used when the optimiser selected the plan. It never represents the real run-time cost, and should only ever be used to understand why the optimiser picked one option over the other. In this case, the estimates are wrong due to the 9% guess.
Paul
May 15, 2010 at 11:52 pm
Thanks for taking the time to write up this explanation. I'm going to try the OPTION (RECOMPILE) as you suggested. I think a bigger problem is the requirement to find the most recent device locations within a date range that can span the entire table. I have to figure out a way around that. Thanks again.
May 16, 2010 at 2:01 am
8kb (5/15/2010)
Thanks for taking the time to write up this explanation. I'm going to try the OPTION (RECOMPILE) as you suggested. I think a bigger problem is the requirement to find the most recent device locations within a date range that can span the entire table. I have to figure out a way around that. Thanks again.
This might give you a few ideas: (I have made some changes to the table and indexing)
-- Switch to a test database
USE Sandpit;
-- Table with log of gps device pings
IF OBJECT_ID('dbo.GPSping')
IS NOT NULL
DROP TABLE dbo.GPSPing;
CREATE TABLE dbo.GPSping
(
row_id INTEGER IDENTITY NOT NULL
CONSTRAINT [CUQ dbo.GPSping row_id]
UNIQUE CLUSTERED,
ping_time DATETIME2 NOT NULL,
device_id INTEGER NOT NULL,
device_location GEOGRAPHY NOT NULL,
);
GO
-- Load table with 3,000,000 rows - 10,000 devices which ping every minute for 300 minutes
INSERT dbo.GPSping WITH (TABLOCK)
(
ping_time,
device_id,
device_location
)
SELECT ping_time,
device_id,
geography::STPointFromText('POINT(-100 30)',4326)
FROM (
SELECT ping_time = DATEADD(MINUTE, R.row_num, '2010-01-01T00:00:00')
FROM (
SELECT TOP (300)
row_num = ROW_NUMBER()
OVER (ORDER BY (SELECT 0))
FROM sys.all_objects
) R
) P
CROSS
JOIN (
SELECT TOP (10000)
device_id = ROW_NUMBER()
OVER (ORDER BY (SELECT 0))
FROM sys.all_objects s1
CROSS
JOIN sys.all_objects s2
) D;
-- Add a nonclustered primary key
ALTER TABLE dbo.GPSping
ADD CONSTRAINT [PK dbo.GPSping device_id, ping_time]
PRIMARY KEY NONCLUSTERED (device_id, ping_time);
GO
-- There might be a table of unique devices already
-- If so, use it in place of this view
CREATE VIEW dbo.Device
WITH SCHEMABINDING
AS
SELECT G.device_id,
record_count = COUNT_BIG(*)
FROM dbo.GPSping G
GROUP BY
G.device_id;
GO
-- Materialise the view of distinct devices
CREATE UNIQUE CLUSTERED INDEX cuq ON dbo.Device (device_id);
GO
-- General function to return the top 'n' ping times
-- for each device over a time period
CREATE FUNCTION dbo.TopN_PingTimes
(
@Start DATETIME2,
@End DATETIME2,
@TopN BIGINT
)
RETURNS TABLE
WITH SCHEMABINDING
AS RETURN
SELECT D.device_id,
HP.ping_time
FROM dbo.Device D WITH (NOEXPAND)
CROSS
APPLY (
-- Highest ping time in the range specified
SELECT TOP (@TopN)
G.ping_time
FROM dbo.GPSping G
WHERE G.device_id = D.device_id
AND G.ping_time BETWEEN @Start AND @End
ORDER BY
G.ping_time DESC
) HP;
GO
-- Drop plans, write dirty buffers to disk, and
-- drop clean buffers.
DBCC FREEPROCCACHE;
CHECKPOINT;
DBCC DROPCLEANBUFFERS;
GO
DECLARE @Start DATETIME2 = '2010-01-01T00:00:00',
@End DATETIME2 = '2010-01-01T02:30:00';
-- Very fast for all date ranges
SELECT TNP.device_id,
TNP.ping_time
FROM dbo.TopN_PingTimes (@Start, @End, 1) TNP;
-- Top 10 pings for a particular device
-- Notice the execution plan changes
SELECT TNP.device_id,
TNP.ping_time
FROM dbo.TopN_PingTimes('2010-01-01T00:00:00', '2010-01-01T01:00:00', 10) TNP
WHERE TNP.device_id = 4983;
GO
-- Tidy up
DROP FUNCTION dbo.TopN_PingTimes;
DROP VIEW dbo.Device;
DROP TABLE dbo.GPSping;
May 18, 2010 at 9:15 pm
Apologize for not responding sooner but I'm looking forward to trying this out in the test table and our production set. Thanks again.
May 19, 2010 at 2:58 am
8kb (5/18/2010)
Apologize for not responding sooner but I'm looking forward to trying this out in the test table and our production set. Thanks again.
No worries. Good luck!
Viewing 9 posts - 1 through 8 (of 8 total)
You must be logged in to reply to this topic. Login to reply