Thursday, June 9, 2011

Remove Duplicate Records From Table

A very known question to find Duplicate Rows from the Table and Delete Duplicate rows from the table.
1.Find Duplicate records in the table ,but the table contains One Identity Column. 
CREATE TABLE FindDuplicates(
EmpID INT IDENTITY(1,1),
EmpName VARCHAR(500)
)
INSERT INTO FindDuplicates VALUES('A')
INSERT INTO FindDuplicates VALUES('A')
INSERT INTO FindDuplicates VALUES('B')
INSERT INTO FindDuplicates VALUES('C')
INSERT INTO FindDuplicates VALUES('C')
INSERT INTO FindDuplicates VALUES('D')
INSERT INTO FindDuplicates VALUES('D')
Lets see the use of Row_Number():
With CTE AS(
SELECT ROW_NUMBER() OVER(PARTITION BY EmpNameORDER BY EmpName) As RowID,
EmpID,
EmpName
FROM FindDuplicates
)
SELECT *
FROM CTE
WHERE RowID > 1
Output will be like:
rowId EmpID EmpName
2      2     A
2      5     C
2      7     D
 
2. Find and delete Duplicate Records from Table.(There is No Primary Column in the Table)Or Find the Duplicate records from the table which Does not contain any Primary Key. 
ID  FNAME  LNAME
1    AAA     CCC
2    BBB     DDD
1    AAA     CCC
2    BBB     DDD
1    AAA     CCC
2    BBB     DDD
3    BCB     DGD
Remove duplicate rows and keep the data in to the table like this using single query.
ID FNAME LNAME
1 AAA CCC
2 BBB DDD
3 BCB DGD

Use the temperary variable and CTE for make this example Quick and fast
Create the Temporary table  using temporary variable.
declare @TBL table(EmpID int)
insert into @TBL(EmpID)
values(90020),(90020),(90021),(90022),(90023),(90022),(90025),(90022)
select * from @TBL order by EmpID
Now USE CTE,for Finding Duplicate records from the Table
;with TBLCTE(EmpID,Ranking)
AS
(
select EmpID,
Ranking = DENSE_RANK() over (PARTITION BY EmpID order by newID())
from @TBL
)
delete from TBLCTE where Ranking > 1
select * from @TBL order by EmpID
 3.Find Duplicate Records in a single select statement with out using the Row_number() or CTE .
Create Some Sample Data:
Create Table SelectDuplicate
(
ID int identity(1,1) not null,
Fname varchar(10) not null,
Lname varchar(10) not null,
City varchar(10) not null
)
Now, this is a time to enter some records in just created table above.
INSERT INTO SelectDuplicate VALUES(‘pushpendra’,‘dhakad’,‘Ahmedabad’)
INSERT INTO SelectDuplicate VALUES(‘Avi’,‘Sagiv’,‘Edison’)
INSERT INTO SelectDuplicate VALUES(‘Dharmesh’,‘Kalaria’,‘Parsipenny’)
INSERT INTO SelectDuplicate VALUES(‘pushpendra’,‘dhakad’,‘WestField’)
INSERT INTO SelectDuplicate VALUES(‘Dharmesh’,‘Kalaria’,‘Ahmedabad’)
So, here is the heart of article below.
SELECT * FROM SelectDuplicate
WHERE ID NOT IN
(
SELECT MAX(ID) FROM SelectDuplicate
GROUP BY Fname,Lname
)
4.Find Duplicate Records from the Table which  contain multiple columns to create  Primary key and delete records from Table.
         CREATE TABLE #Fact_Policy_Tracker(
24.     [identity] [int] identity,
25.      [Client_Key] [smallint]  NULL,
26.      [Date_Key] [int]  NULL,
27.      [Target_Volume] [decimal](10, 4) NULL,
28.      [Target_Return] [nvarchar](20) NULL,
29.      [Iliquid Allocation] [decimal](10, 4) NULL,
30.      [Liquid Allocation] [decimal](10, 4) NULL,
31.      [BATCH_ID] [int] NULL,
32.      [INSERT_UPDATE_IND] [char](1) NULL
33.      )
34.GO   
35.insert into #Fact_Policy_Tracker
36.(
37.      [Client_Key],
38.      [Date_Key],
39.      [Target_Volume],
40.      [Target_Return],
41.      [Iliquid Allocation],
42.      [Liquid Allocation],
43.      [BATCH_ID],
44.      [INSERT_UPDATE_IND]
45.)    
46.select
47.    [Client_Key],
48.      [Date_Key],
49.      [Target_Volume],
50.      [Target_Return],
51.      [Iliquid Allocation],
52.      [Liquid Allocation],
53.      [BATCH_ID],
54.      [INSERT_UPDATE_IND]
55.from [DW].[Fact_Policy_Tracker]
56.GO
57.------------------------------------------------------------------------
58.With CTE AS(
59.select
60.RANK() over(partition by Client_Alternate_Key,Date_Key,Target_Volume,Target_Return,
61.                                          [Iliquid Allocation],[Liquid Allocation] order by [identity]) as RowID,
62.s.*,c.Client_Alternate_Key
63.from #Fact_Policy_Tracker s
64.INNER JOIN dw.Dim_Client c on s.Client_Key = c.Client_Key
65.)
66.INSERT INTO rej.Fact_Policy_Tracker_REJ
67.(
68.      [Client_ID]
69.      ,[Date_Key]
70.      ,[Target_Volume]
71.      ,[Target_Return]
72.      ,[Iliquid Allocation]
73.      ,[Liquid Allocation]
74.      ,[BATCH_ID]
75.      ,[ERROR_IND]
76.      ,[ERROR_TEXT]
77.      ,[SOURCE_NAME]
78.)
79.SELECT           
80.         ip.Client_Alternate_Key
81.        ,ip.[Date_Key]
82.        ,ip.Target_Volume
83.        ,ip.Target_Return
84.        ,ip.[Iliquid Allocation]
85.        ,ip.[Liquid Allocation]
86.      ,ip.[BATCH_ID]
87.      ,'Y'
88.      ,'Hot Fix Duplicate Records'
89.      ,''
90.FROM CTE  IP
91.WHERE RowID >1
92.GO
93.--------------------------------------------------------------------------
94.With CTE AS(
95.select
96.RANK() over(partition by Client_Alternate_Key,Date_Key,Target_Volume,Target_Return,
97.                                          [Iliquid Allocation],[Liquid Allocation] order by [identity]) as RowID,
98.s.*,c.Client_Alternate_Key
99.from #Fact_Policy_Tracker s
100.        INNER JOIN dw.Dim_Client c on s.Client_Key = c.Client_Key
101.        )
102.        DELETE FROM #Fact_Policy_Tracker
103.        where [identity] in (select [identity] from CTE where RowID >1)
104.       
105.        GO
106.        ---------------------------------------------------------
107.        truncate table DW.Fact_Policy_Tracker
108.        GO
109.        insert into DW.Fact_Policy_Tracker
110.        (
111.              [Client_Key],
112.              [Date_Key],
113.              [Target_Volume],
114.              [Target_Return],
115.              [Iliquid Allocation],
116.              [Liquid Allocation],
117.              [BATCH_ID],
118.              [INSERT_UPDATE_IND]
119.        )
120.        select
121.            [Client_Key],
122.              [Date_Key],
123.              [Target_Volume],
124.              [Target_Return],
125.              [Iliquid Allocation],
126.              [Liquid Allocation],
127.              [BATCH_ID],
128.              [INSERT_UPDATE_IND]
129.        from #Fact_Policy_Tracker
130.        GO
131.        ---------Update the Fact_Portfolio_Risk_Summary------------
132.        UPDATE PST
133.        Set PST.Client_Key =C.Client_Key
134.        FROM
135.            DW.Fact_Policy_Tracker PST,
136.            DW.Dim_Client C
137.        WHERE
138.            C.Client_Alternate_Key = (SELECT DISTINCT Client_Alternate_Key FROM DW.Dim_Client WHERE Client_Key = PST.Client_Key)
139.            AND PST.Date_Key between MD.fnGetDateKey(C.[Start_Date]) and MD.fnGetDateKey(ISNULL(C.END_DATE, GETDATE()))
140.            and PST.Client_Key <> c.Client_Key
141.        GO
142.       
143.       
144.        Drop table #Fact_Policy_Tracker
145.           GO



Now, we need to Find the duplicate Rows from the table. So we need to get “A”, “C” and “D”.
 Sample data:

No comments:

Post a Comment