August 25, 2009 at 6:24 am
Ok....my syntax is below. I am running a merge statement that merges incomming data into a dimension table.
I have a clustered index on the id column and a non-clustered index on the columns in the join.
This query is not using my non-clustered index at any point. except to update it at the end.
What can I do to change this. I can't provide any index hints to the target table!! I can provide to the temp table, but that is a mute point.
MERGE datawarehouse.DIM.T_DM_POLICY dp
USING #tmp s ON
s.cbdwmco = dp.POL_MASTER_COMPANY_NUMBER AND
s.cbdwpolsym = dp.POL_POLICY_SYMBOL AND
s.cbdwpolnum = dp.POL_POLICY_NUMBER AND
s.cbdwpolmod = dp.POL_POLICY_MODULE
WHEN NOT MATCHED THEN INSERT (
POL_POLICY_NUMBER,
POL_POLICY_MODULE,
POL_POLICY_SYMBOL,
POL_MASTER_COMPANY_NUMBER,
POL_PREVIOUS_POLICY_NUMBER,
POL_EFFECTIVE_DATE_REF,
POL_EXPIRATION_DATE_REF,
POL_ORIGINAL_ENTERED_DATE_REF,
POL_ORIGINAL_INCEPTION_DATE_REF,
POL_GUARANTEED_RENEWAL_DATE_REF,
POL_TERM_STATUS,
POL_INFORCE_IND,
POL_POLICY_STATUS,
POL_NUM_OF_NON_PAY,
POL_NUM_OF_REINSTATES,
POL_NUM_OF_CANCELS,
POL_LAST_CANCEL_DATE_REF,
POL_LAST_REINSTATE_DATE_REF,
POL_LAST_CANCEL_REASON,
POL_MAX_REINSTATES,
POL_MAX_INSURANCE_SCORE,
POL_MAX_AGE_CALC,
POL_ACCOUNT_NUMBER,
POL_PIF_PRE_RENEWAL_IND,
POL_NEW_RENEWAL_IND,
POL_NEW_RENEWAL_DATE_REF,
POL_NUM_OF_NONPAY_PRIOR_TERM_3YR,
POL_MAX_TERM_INSURANCE_SCORE,
POL_LAST_UPDATED_DATE_REF,
POL_MIN_TERM_INSURANCE_SCORE,
POL_PROCESS_IND,
POL_CURRENT_MODULE_IND,
--POL_REWRITE_IND,
POL_PRIOR_POLICY_ON_REWRITE,
POL_PROFIT_CENTER_DIVISION_ID,
POL_CURRENT_PRODUCER_CD,
POL_CANCEL_STATUS_60_DAY,
POL_CANCEL_REASON_60_DAY,
--POL_ETL_FILE_ID,
POL_ACTIVITY_DATE,
--POL_REWRITE_IND_2,
--POL_PRIOR_POLICY_ON_REWRITE_2,
POL_CURRENT_DM_AGENCY_ID,
POL_CLIENT_ID,
POL_MUDS_SSN,
POL_CUSTOMER_ADDRESS_1,
POL_CUSTOMER_ADDRESS_2,
POL_CUSTOMER_CITY,
POL_CUSTOMER_EMAIL,
POL_CUSTOMER_NAME_1,
POL_CUSTOMER_NAME_2,
POL_CUSTOMER_OPT_OUT,
POL_CUSTOMER_PHONE,
POL_CUSTOMER_PRINT_ADDRESS_1,
POL_CUSTOMER_PRINT_ADDRESS_2,
POL_CUSTOMER_PRINT_ADDRESS_3,
POL_CUSTOMER_PRINT_ADDRESS_4,
POL_CUSTOMER_SORT_NAME,
POL_CUSTOMER_STATE,
POL_CUSTOMER_ZIP,
POL_CUSTOMER_ZIP_4,
POL_ADDRESS_SCRUBBED_IND
)
VALUES(
s.cbdwpolnum,
s.cbdwpolmod,
s.cbDwPolsym,
s.cbdwmco,
coalesce(cbRenewPolicyNumber,null,''),
coalesce( cbEffectiveDate,null,''),
coalesce( cbExpirationDate,null,''),
coalesce( cbEnteredDate,null,''),
coalesce( cbOriginalIncept,null,''),
coalesce(Guaranteed_renewal,null,''),
coalesce(term_status,null,''),
coalesce( exInforceInd,null,''),
coalesce( exPolicyStatus,null,''),
coalesce(NONPAYS,NULL,0),
coalesce(reinstates,NULL,0),
coalesce(NumofCancels,NULL,0),
coalesce(exLastCancelDateRef,NULL,0),
coalesce(Last_Reinstate,NULL,0),
coalesce(exLast_Cancel_Reason,NULL,''),
coalesce(maxreinstates,null,0),
coalesce(max_insurance_score,null,0),
coalesce(maxage,null,0),
coalesce( cbCustomerNumber,null,''),
coalesce( exPIFPreRenewalInd,null,''),
CASE
WHEN cbTransaction IN('10', '18') THEN 'New'
WHEN cbTransaction IN('11', '19') THEN 'Renewal'
ELSE 'Unknown'
END,
coalesce(cbAccountEnteredDate,null,''), --was s.exNewOrRenewalDateRef,
0,
0,
20030319,
'0000',
coalesce( cbProcessInd,null,''),
0,
--'',
'',
coalesce( exProfitCenterDivisionId, NULL,0),
coalesce( cbProducerCode,null,''),
'',
'',
--0, --was coalesce(s.exEtlFileId, NULL,0),
getdate(),
--'',
--'',
0,
coalesce(clientid,null,''),
coalesce(mudssn,null,''),
coalesce( exCustomerAddress1,null,''),
coalesce( exCustomerAddress2,null,''),
coalesce( exCustomerCity,null,''),
coalesce( cbEmail,null,''),
coalesce( exCustomerName1,null,''),
coalesce( exCustomerName2,null,''),
coalesce( cbOpt,null,''),
coalesce( cbPhone,null,''),
coalesce( cbAddressLine1,null,''),
coalesce( cbAddressLine2,null,''),
coalesce( cbAddressLine3,null,''),
coalesce( cbAddressLine4,null,''),
coalesce( cbSortName,null,''),
coalesce( exCustomerState,null,''),
coalesce( cbZip5DigitCode,null,''),
coalesce( cbPlusFourZip,null,''),
coalesce( cbAddressScrubbed,null,'')
)
WHEN MATCHED THEN
UPDATE
SET
POL_GUARANTEED_RENEWAL_DATE_REF = coalesce(Guaranteed_renewal,null,''),
POL_TERM_STATUS = coalesce(term_status,null,''),
POL_INFORCE_IND = coalesce( exInforceInd,null,''),
POL_POLICY_STATUS = coalesce( exPolicyStatus,null,''),
POL_NUM_OF_NON_PAY = coalesce(NONPAYS,NULL,0),
POL_NUM_OF_REINSTATES = coalesce(reinstates,NULL,0),
POL_NUM_OF_CANCELS = coalesce(NumofCancels,NULL,0),
POL_LAST_CANCEL_DATE_REF = coalesce(exLastCancelDateRef,NULL,0),
POL_LAST_REINSTATE_DATE_REF = coalesce(Last_Reinstate,NULL,0),
POL_LAST_CANCEL_REASON = coalesce(exLast_Cancel_Reason,NULL,''),
POL_ACCOUNT_NUMBER = coalesce( cbCustomerNumber,null,''),
POL_PIF_PRE_RENEWAL_IND = coalesce( exPIFPreRenewalInd,null,''),
--CURRENT_POLICY_LOAD_ID = s.PolicyLoadId,
POL_LAST_UPDATED_DATE_REF = 20030319,
POL_PROCESS_IND = coalesce( cbProcessInd,null,''),
POL_PROFIT_CENTER_DIVISION_ID = coalesce( exProfitCenterDivisionId, NULL,0),
POL_CURRENT_PRODUCER_CD = coalesce( cbProducerCode,null,''),
--POL_ETL_FILE_ID = 0,
POL_ACTIVITY_DATE = getdate(),
POL_CLIENT_ID = coalesce(clientid,null,''),
POL_MUDS_SSN =coalesce(mudssn,null,''),
POL_CUSTOMER_ADDRESS_1=coalesce( exCustomerAddress1,null,''),
POL_CUSTOMER_ADDRESS_2=coalesce( exCustomerAddress2,null,''),
POL_CUSTOMER_CITY=coalesce( exCustomerCity,null,''),
POL_CUSTOMER_EMAIL=coalesce( cbEmail,null,''),
POL_CUSTOMER_NAME_1=coalesce( exCustomerName1,null,''),
POL_CUSTOMER_NAME_2=coalesce( exCustomerName2,null,''),
POL_CUSTOMER_OPT_OUT=coalesce( cbOpt,null,''),
POL_CUSTOMER_PHONE=coalesce( cbPhone,null,''),
POL_CUSTOMER_PRINT_ADDRESS_1=coalesce( cbAddressLine1,null,''),
POL_CUSTOMER_PRINT_ADDRESS_2=coalesce( cbAddressLine2,null,''),
POL_CUSTOMER_PRINT_ADDRESS_3=coalesce( cbAddressLine3,null,''),
POL_CUSTOMER_PRINT_ADDRESS_4=coalesce( cbAddressLine4,null,''),
POL_CUSTOMER_SORT_NAME=coalesce( cbSortName,null,''),
POL_CUSTOMER_STATE=coalesce( exCustomerState,null,''),
POL_CUSTOMER_ZIP=coalesce( cbZip5DigitCode,null,''),
POL_CUSTOMER_ZIP_4=coalesce( cbPlusFourZip,null,''),
POL_ADDRESS_SCRUBBED_IND = coalesce( cbAddressScrubbed,null,'')
;
August 25, 2009 at 6:44 am
Why do you think the seek will be more efficient? If SQL does a seek on the NC index, it will have to do one seek for each matching value in the other table. Optimiser's probably decided that a scan is overall more efficient.
How many rows will there be in the temp table?
You might be able to use the with index and the forceseek hints in an option clause. Books Online will have the details. If you do that, please test that it really is more efficient.
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
August 25, 2009 at 6:49 am
good point Gila, I do not know if it is more efficient.
What I was driving at, though, is that the merge syntax doesn't seem to allow index hints on the target table.
I am curious to know what my options are with that.
The dimension table will be millions of rows and my temp table will be 100,000 or so....making the dimension table my primary focus.
perhaps the scan vs. seek point does not apply, but shouldn't the query plan use the non-clustered index at some point?
Just seems odd not to.
Thanks for such a quick response.. This site is GREAT
August 25, 2009 at 6:53 am
I am actually moving a datawarehouse from 2000 into 2008 with tons of modeling changes.
Here is the syntax for how I am batching the popuation of this particular dimension. I am open WAY open to constructive ctritisisms.
declare @cur cursor,
@moe_date as int
set @cur = cursor local fast_forward for
select distinct moe_date from staging.dbo.t_sm_risk (nolock)
order by moe_date
open @cur
fetch next from @cur into @moe_date
while @@fetch_status = 0
begin
select distinct
cbDwPolnum,
cbDwPolmod,
cbDwPolsym,
cbDwMco,
ExUnitNumber,
ExSubUnitNumber,
ExRiskSeq,
exRiskType,
max(cbSerialNumber) as cbSerialNumber,
max(exRiskStatus) as exRiskStatus,
max(cbRiskStateProv) as cbRiskStateProv,
max(cbZip5DigitCode)as cbZip5DigitCode ,
max(cbAddressLine1) as cbAddressLine1,
max(cbAddressLine2) as cbAddressLine2,
max(cbAddressLine3) as cbAddressLine3,
max(cbAddressLine4) as cbAddressLine4,
max(cbaddressScrubbed) as cbaddressScrubbed,
max(cbPlusFourZip) as cbPlusFourZip,
max(cbDescGlLongitude) as cbDescGlLongitude,
max(cbDescGLLatitude) as cbDescGLLatitude,
max(cbCountys45) as cbCountys45,
max(cbDwCitys45) as cbDwCitys45,
max(AddressType) as AddressType,
max(cbLocProvTerritorys45) as cbLocProvTerritorys45
into #tmp
from staging.dbo.t_sm_risk (nolock)
where moe_date=@moe_date
group by
cbDwPolnum,
cbDwPolmod,
cbDwPolsym,
cbDwMco,
ExUnitNumber,
ExSubUnitNumber,
ExRiskSeq,
exRiskType
MERGE INTO datawarehouse.DIM.T_DM_RISK
USING #tmp ON
cbDwPolnum = RSK_POLICY_NUMBER and
cbDwPolmod = RSK_POLICY_MODULE and
cbDwPolsym = RSK_POLICY_SYMBOL and
cbDwMco = RSK_MASTER_COMPANY_NUMBER and
ExUnitNumber = RSK_UNIT_NUMBER and
ExSubUnitNumber = RSK_SUB_UNIT_NUMBER and
ExRiskSeq = RSK_RISK_SEQ and
exRiskType = RSK_RISK_TYPE
WHEN NOT MATCHED THEN INSERT (
RSK_POLICY_NUMBER,
RSK_POLICY_MODULE,
RSK_POLICY_SYMBOL,
RSK_MASTER_COMPANY_NUMBER,
RSK_UNIT_NUMBER,
RSK_SUB_UNIT_NUMBER,
RSK_RISK_SEQ,
RSK_RISK_TYPE,
RSK_VIN,
RSK_RISK_STATUS,
RSK_RISK_STATE,
RSK_RISK_ZIP,
RSK_RISK_PRINT_ADDRESS_1,
RSK_RISK_PRINT_ADDRESS_2,
RSK_RISK_PRINT_ADDRESS_3,
RSK_RISK_PRINT_ADDRESS_4,
RSK_RISK_ADDRESS_SCRUBBED_IND,
RSK_RISK_ZIP_4,
RSK_RISK_LONGITUDE,
RSK_RISK_LATITUDE,
RSK_COUNTY,
RSK_TAXLOC,
RSK_ADDRESS_TYPE,
RSK_TERRITORY,
RSK_ACTIVITY_DATE
)
VALUES(
cbDwPolnum,
cbDwPolmod,
cbDwPolsym,
cbDwMco,
ExUnitNumber,
ExSubUnitNumber,
ExRiskSeq,
exRiskType,
coalesce(cbSerialNumber,' '),
coalesce(exRiskStatus,' '),
coalesce(cbRiskStateProv,' '),
coalesce(cbZip5DigitCode,' '),
coalesce(cbAddressLine1,' '),
coalesce(cbAddressLine2,' '),
coalesce(cbAddressLine3,' '),
coalesce(cbAddressLine4,' '),
coalesce(cbaddressScrubbed,' '),
coalesce(cbPlusFourZip,' '),
coalesce(cbDescGlLongitude,' '),
coalesce(cbDescGLLatitude,' '),
coalesce(cbCountys45,' '),
coalesce(cbDwCitys45,' '),
AddressType,
coalesce(cbLocProvTerritorys45,' '),
getdate()
)
WHEN MATCHED THEN
UPDATE
SET
RSK_POLICY_NUMBER = cbDwPolnum,
RSK_POLICY_MODULE = cbDwPolmod,
RSK_POLICY_SYMBOL = cbDwPolsym ,
RSK_MASTER_COMPANY_NUMBER = cbDwMco,
RSK_UNIT_NUMBER = ExUnitNumber,
RSK_SUB_UNIT_NUMBER = ExSubUnitNumber,
RSK_RISK_SEQ = ExRiskSeq,
RSK_RISK_TYPE = exRiskType,
RSK_VIN = coalesce(cbSerialNumber,' '),
RSK_RISK_STATUS = coalesce(exRiskStatus,' '),
RSK_RISK_STATE = coalesce(cbRiskStateProv,' '),
RSK_RISK_ZIP = coalesce(cbZip5DigitCode,' '),
RSK_RISK_PRINT_ADDRESS_1 = coalesce(cbAddressLine1,' '),
RSK_RISK_PRINT_ADDRESS_2 = coalesce(cbAddressLine2,' '),
RSK_RISK_PRINT_ADDRESS_3 = coalesce(cbAddressLine3,' '),
RSK_RISK_PRINT_ADDRESS_4 = coalesce(cbAddressLine4,' '),
RSK_RISK_ADDRESS_SCRUBBED_IND = coalesce(cbAddressScrubbed,' '),
RSK_RISK_ZIP_4 = coalesce(cbPlusFourZip,' '),
RSK_RISK_LONGITUDE = coalesce(cbDescGlLongitude,' '),
RSK_RISK_LATITUDE = coalesce(cbDescGLLatitude,' '),
RSK_COUNTY = coalesce(cbCountys45,' '),
RSK_TAXLOC = coalesce(cbDwCitys45,' '),
RSK_ADDRESS_TYPE = AddressType,
RSK_TERRITORY = coalesce(cbLocProvTerritorys45,' ')
;
drop table #tmp
fetch next from @cur into @moe_date
end
close @cur
deallocate @cur
August 25, 2009 at 7:00 am
villersk (8/25/2009)
What I was driving at, though, is that the merge syntax doesn't seem to allow index hints on the target table.
Did you try using the OPTION clause?
perhaps the scan vs. seek point does not apply, but shouldn't the query plan use the non-clustered index at some point?
Depends, not if it's more efficient to just scan the cluster.
If SQL were to do seeks against the dimension table, then it would have to do one seek for each row in the temp table. That's 100,000 or so seeks, each probably taking 3 or 4 page reads. Then it will have to lookup to the cluster (again once per row) to get the actual row to modify (for the update portion). That's a lot of work compared to just scanning the cluster.
It could scan the noncluster, then do lookups to the cluster, but that's still going to require 100,000 or so lookups to the cluster to get the records.
Sometimes a table scan really is the fastest way to do things.
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
August 25, 2009 at 7:13 am
cool....thanks for the input.
I am having pretty good overall performance with this, just trying to tighten up anything I can.
Just read that if the source table is a considerable amount smaller than the target it may be best to perform a nested loop instead of hash join.
OPTION (LOOP JOIN);
We'll see. Not sure how small we are talking. I will post my findings back here.
August 25, 2009 at 7:26 am
What I was talking about above is a loop join. 100000 seeks against the dimension table, then all those lookups to the cluster to get the row.
Generally hints are not necessary unless you're run onto one of the odd edge cases where the optimiser makes a bad decision. Not generally recommended.
Why do you have a cursor anyway? What's wrong with doing the entire merge in one statement?
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
August 25, 2009 at 7:36 am
I've seen the query optimiser make some seriously bad decisions....I don't think this one is soo bad.
...but, the query plan is using a hash-match and needs to be told to use the nested loop..
I still don't have results back on what would be better in my case, but this article sums it up.
http://technet.microsoft.com/en-us/library/cc879317.aspx
-----------
On the cursor thing, The system gets a file for everyday. I need to be sure I update the most recent data...I suppose I could run everything except the max date in one swoop and then run the max..that way I would still have all the most recent values for a given row.
Good call btw.
August 25, 2009 at 7:43 am
villersk (8/25/2009)
I've seen the query optimiser make some seriously bad decisions....I don't think this one is soo bad.
It can, in most cases that's due to badly written queries or out of date statistics
...but, the query plan is using a hash-match and needs to be told to use the nested loop..
Why?
The nested loop join is great for joining small tables and it is usually the best join when one table is quite small.
The hash join is the heavy-lifter among the joins, it's the most efficient one for joining massive resultsets together.
Quite frankly, if I saw an exec plan with a nested loop join between a few million row table and a few hundred thousand row table, I would be suspecting that the optimiser made a bad decision and would check that the row estimates were right.
Gail Shaw
Microsoft Certified Master: SQL Server, MVP, M.Sc (Comp Sci)
SQL In The Wild: Discussions on DB performance with occasional diversions into recoverability
August 25, 2009 at 7:47 am
indeed...this query may be optimum...thanks
Viewing 10 posts - 1 through 9 (of 9 total)
You must be logged in to reply to this topic. Login to reply