Title here
Summary here
This article contains example code snippets that show how to use the Distinct component.
DistinctColumnThis example demonstrates using the Distinct component to eliminate duplicate records in a data flow. The DistinctColumn attribute specifies which properties to use for identifying duplicates.
public class MyRow
{
[DistinctColumn]
public int Id { get; set; }
[DistinctColumn]
public string Value { get; set; }
public string TestId { get; set; }
}
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "B", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 3, Value = "B", TestId = "Test6" });
var trans = new Distinct<MyRow>();
var dest = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
*/This example illustrates how to use the LinkDuplicatesTo method to send duplicates to a different data flow.
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "B", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 2, Value = "A", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 3, Value = "B", TestId = "Test6" });
var trans = new Distinct<MyRow>();
var dest = new MemoryDestination<MyRow>();
var duplicateDest = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(duplicateDest);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
foreach (var row in duplicateDest.Data)
Console.WriteLine($"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:2 Value:A TestId:Test2
Id:2 Value:B TestId:Test3
Id:3 Value:B TestId:Test6
Duplicate - Id:1 Value:A TestId:Test4
Duplicate - Id:2 Value:A TestId:Test5
*/Like all ETLBox data flow components, the Distinct component also facilitates the use of dynamic ExpandoObject.
var source = new MemorySource();
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "A", otherValue: "1"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "5"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "C", otherValue: "2"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "A", otherValue: "3"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 1, distinctCol2: "C", otherValue: "4"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "5"));
source.DataAsList.Add(CreateDynamicRow(distinctCol1: 2, distinctCol2: "B", otherValue: "6"));
var trans = new Distinct();
trans.DistinctColumns = new DistinctColumn[]{
new DistinctColumn() { DistinctPropertyName = "DistinctCol1"},
new DistinctColumn() { DistinctPropertyName = "DistinctCol2" }
};
var dest = new MemoryDestination();
var destDuplicates1 = new MemoryDestination();
var destDuplicates2 = new MemoryDestination();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(destDuplicates1, row => (row as dynamic).DistinctCol1 == 1);
trans.LinkDuplicatesTo(destDuplicates2, row => (row as dynamic).DistinctCol1 == 2);
Network.Execute(source);
foreach (dynamic row in dest.Data)
Console.WriteLine($"DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
foreach (dynamic row in destDuplicates1.Data)
Console.WriteLine($"Duplicate 1 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
foreach (dynamic row in destDuplicates2.Data)
Console.WriteLine($"Duplicate 2 - DistinctCol1:{row.DistinctCol1} DistinctCol2:{row.DistinctCol2} OtherValue:{row.OtherValue}");
//Output
/*
DistinctCol1:1 DistinctCol2:A OtherValue:1
DistinctCol1:2 DistinctCol2:B OtherValue:5
DistinctCol1:1 DistinctCol2:C OtherValue:2
Duplicate 1 - DistinctCol1:1 DistinctCol2:A OtherValue:3
Duplicate 1 - DistinctCol1:1 DistinctCol2:C OtherValue:4
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:5
Duplicate 2 - DistinctCol1:2 DistinctCol2:B OtherValue:6
*/
public ExpandoObject CreateDynamicRow(int distinctCol1, string distinctCol2, string otherValue) {
dynamic r = new ExpandoObject();
r.DistinctCol1 = distinctCol1;
r.DistinctCol2 = distinctCol2;
r.OtherValue = otherValue;
return r;
}Use the GetUniqueKeyFunc property to define a custom logic for creating a unique key for each row.
var source = new MemorySource<MyRow>();
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A", TestId = "Test1" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "B", TestId = "Test2" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "A_dupe", TestId = "Test3" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "b_dupe", TestId = "Test4" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "c", TestId = "Test5" });
source.DataAsList.Add(new MyRow() { Id = 1, Value = "a", TestId = "Test6" });
var trans = new Distinct<MyRow>();
trans.GetUniqueKeyFunc = (row) => row.Value.Substring(0, 1).ToLower();
var dest = new MemoryDestination<MyRow>();
var destDuplicates = new MemoryDestination<MyRow>();
source.LinkTo(trans);
trans.LinkTo(dest);
trans.LinkDuplicatesTo(destDuplicates);
Network.Execute(source);
foreach (var row in dest.Data)
Console.WriteLine($"Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
foreach (var row in destDuplicates.Data)
Console.WriteLine($"Duplicate - Id:{row.Id} Value:{row.Value} TestId:{row.TestId}");
//Output
/*
Id:1 Value:A TestId:Test1
Id:1 Value:B TestId:Test2
Id:1 Value:c TestId:Test5
Duplicate - Id:1 Value:A_dupe TestId:Test3
Duplicate - Id:1 Value:b_dupe TestId:Test4
Duplicate - Id:1 Value:a TestId:Test6
*/