Distinct
This article contains example code snippets that show how to use the Distinct component.
Filtering Duplicates with DistinctColumn
This example demonstrates using the Distinct
component to eliminate duplicate records in a data flow. The DistinctColumn
attribute specifies which properties to use for identifying duplicates.
public class MyRow
{
[DistinctColumn]
public int Id { get; set; }
[DistinctColumn]
public string Value { get; set; }
public string TestId { get; set; }
}
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "B", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 3, Value = "B", TestId = "Test6" });
var trans = new Distinct<MyRow>();
var dest = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
*/
Redirecting Duplicates to an Alternate Flow
This example illustrates how to use the LinkDuplicatesTo
method to send duplicates to a different data flow.
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "B", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 3, Value = "B", TestId = "Test6" });
var trans = new Distinct<MyRow>();
var dest = new MemoryDestination<MyRow>();
var duplicateDest = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(duplicateDest);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
foreach (var row in duplicateDest.Data)
Console.WriteLine($"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
Duplicate - Id:1 Value:A TestId:Test4
Duplicate - Id:2 Value:A TestId:Test5
*/
Using Distinct with Dynamic Objects
Like all ETLBox data flow components, the Distinct component also facilitates the use of dynamic ExpandoObject
.
var source = new MemorySource();
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "A", otherValue: "1"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "5"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "C", otherValue: "2"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "A", otherValue: "3"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "C", otherValue: "4"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "5"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "6"));
var trans = new Distinct();
trans.DistinctColumns = new DistinctColumn[]{
new DistinctColumn() { DistinctPropertyName = "DistinctCol1"},
new DistinctColumn() { DistinctPropertyName = "DistinctCol2" }
};
var dest = new MemoryDestination();
var destDuplicates1 = new MemoryDestination();
var destDuplicates2 = new MemoryDestination();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(destDuplicates1, row => (row as dynamic).DistinctCol1 == 1);
trans.LinkDuplicatesTo(destDuplicates2, row => (row as dynamic).DistinctCol1 == 2);
Network.Execute(source);
foreach (dynamic row in dest.Data)
Console.WriteLine($"DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
foreach (dynamic row in destDuplicates1.Data)
Console.WriteLine($"Duplicate 1 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
foreach (dynamic row in destDuplicates2.Data)
Console.WriteLine($"Duplicate 2 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
//Output
/*
DistinctCol1:1 DistinctCol2:A OtherValue:1
DistinctCol1:2 DistinctCol2:B OtherValue:5
DistinctCol1:1 DistinctCol2:C OtherValue:2
Duplicate 1 - DistinctCol1:1 DistinctCol2:A OtherValue:3
Duplicate 1 - DistinctCol1:1 DistinctCol2:C OtherValue:4
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:5
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:6
*/
public ExpandoObject CreateDynamicRow(int distinctCol1, string distinctCol2, string otherValue) {
dynamic r = new ExpandoObject();
r.DistinctCol1 = distinctCol1;
r.DistinctCol2 = distinctCol2;
r.OtherValue = otherValue;
return r;
}
Custom Unique Key Function
Use the GetUniqueKeyFunc
property to define a custom logic for creating a unique key for each row.
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "B", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A_dupe", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "b_dupe", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "c", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "a", TestId = "Test6" });
var trans = new Distinct<MyRow>();
trans.GetUniqueKeyFunc = (row) => row.Value.Substring(0, 1).ToLower();
var dest = new MemoryDestination<MyRow>();
var destDuplicates = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(destDuplicates);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
foreach (var row in destDuplicates.Data)
Console.WriteLine($"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:1 Value:B TestId:Test2
Id:1 Value:c TestId:Test5
Duplicate - Id:1 Value:A_dupe TestId:Test3
Duplicate - Id:1 Value:b_dupe TestId:Test4
Duplicate - Id:1 Value:a TestId:Test6
*/