My solution involves the use of the DataTable class, since they display the .csv data well when using the WPF DataGrid.
What needs to happen to fix the data of a given column:
- Remove any empty entries
- Remove any duplicates
- Replacing invalid entries with valid entries
- Repeat steps 2 and 3 untill the data is in the format you want it to be.
I ended up creating an extension class for the DataTable to tackle the above steps:
public static class MyDataTableExtensions
{
/// <summary>
/// Replaces the values in all rows in columnName using RegEx, can optionally remove any duplicate values as well.
/// </summary>
/// <param name="dt"></param>
/// <param name="columnName"></param>
/// <param name="pattern"></param>
/// <param name="replacement"></param>
/// <param name="removeDuplicates"></param>
/// <returns></returns>
public static DataTable ReplaceDataTableRows(this DataTable dt, string columnName, string pattern, string replacement, bool removeDuplicates = false)
{
if (dt is null || dt.AsEnumerable().Any() is false) throw new ArgumentException($"The DataTable '{ nameof(dt) }' does not contain any elements. Please make sure that the provided DataTable is not null or empty.");
if (string.IsNullOrEmpty(columnName)) throw new ArgumentNullException(nameof(columnName));
if (string.IsNullOrEmpty(pattern)) throw new ArgumentNullException(nameof(pattern));
DataTable output = dt.Copy();
Regex regex = new Regex(pattern);
string item;
foreach (DataRow row in output.Rows)
{
foreach (DataColumn column in output.Columns)
{
if (column.ColumnName.ToLower().Equals(columnName.ToLower()))
{
if (column.ReadOnly) column.ReadOnly = false;
item = regex.Replace(row.Field<object>(column).ToString(), replacement);
if (string.IsNullOrEmpty(item))
{
row.Delete();
continue;
}
row.SetField(column, item);
}
}
}
output.AcceptChanges();
return removeDuplicates ? RemoveDuplicatesFromDataTableColumn(output, columnName) : output;
}
/// <summary>
/// Removes any empty values found in the rows in the given column
/// </summary>
/// <param name="dt"></param>
/// <param name="columnName"></param>
/// <returns></returns>
public static DataTable RemoveEmptyValuesFromDataTableColumn(this DataTable dt, string columnName)
{
DataTable output = dt.Copy();
foreach (DataRow row in output.Rows)
{
foreach (DataColumn column in output.Columns)
{
if (column.ColumnName.Equals(columnName))
{
if (column.ReadOnly) column.ReadOnly = false;
if (string.IsNullOrEmpty(row.Field<object>(column).ToString()))
{
row.Delete();
}
}
}
}
output.AcceptChanges();
return output;
}
/// <summary>
/// Removes any duplicate rows found in the column with 'columnName'
/// </summary>
/// <param name="dt"></param>
/// <param name="columnName"></param>
/// <returns></returns>
public static DataTable RemoveDuplicatesFromDataTableColumn(this DataTable dt, string columnName)
{
if (dt is null || dt.AsEnumerable().Any() is false) throw new ArgumentException($"The DataTable '{ nameof(dt) }' does not contain any elements. Please make sure that the provided DataTable is not null or empty.");
if (string.IsNullOrEmpty(columnName)) throw new ArgumentNullException(nameof(columnName));
return dt.AsEnumerable().GroupBy(x => x.Field<object>(columnName))
.Select(x => x.First()).CopyToDataTable();
}
/// <summary>
/// Prepends the valueToPrepend to the row in columnName if the value in the row does NOT match the given pattern
/// </summary>
/// <param name="dt"></param>
/// <param name="columnName"></param>
/// <param name="pattern"></param>
/// <param name="valueToPrepend"></param>
/// <returns></returns>
public static DataTable PrependValueToDataTableColumn(this DataTable dt, string columnName, string pattern, string valueToPrepend)
{
if (dt is null || dt.AsEnumerable().Any() is false) throw new ArgumentException($"The DataTable '{ nameof(dt) }' does not contain any elements. Please make sure that the provided DataTable is not null or empty.");
if (string.IsNullOrEmpty(columnName)) throw new ArgumentNullException(nameof(columnName));
if (string.IsNullOrEmpty(pattern)) throw new ArgumentNullException(nameof(pattern));
if (string.IsNullOrEmpty(valueToPrepend)) throw new ArgumentNullException(nameof(valueToPrepend));
DataTable output = dt.Copy();
Regex regex = new Regex(pattern);
string item;
foreach (DataRow row in output.Rows)
{
foreach (DataColumn column in output.Columns)
{
if (column.ColumnName.ToLower().Equals(columnName.ToLower()))
{
if (column.ReadOnly) column.ReadOnly = false;
item = row.Field<object>(column).ToString();
if (regex.IsMatch(item) == false)
{
row.SetField(column, $"{ valueToPrepend }{ item }");
}
}
}
}
output.AcceptChanges();
return output;
}
}
After that, for my data-set, the solution is actually relatively simple:
private void Button_Click(object sender, RoutedEventArgs e)
{
// eventually the column will be selectable through the use of a ComboBox
var selection = cb.SelectedItem;
string columnName = selection is not null ? cb.SelectedItem.ToString() : "Objectgebondennummer";
if (!string.IsNullOrEmpty(columnName))
{
// solution:
var resultDt = dt.Copy();
resultDt = resultDt.RemoveEmptyValuesFromDataTableColumn(columnName) // step 1
.RemoveDuplicatesFromDataTableColumn(columnName) // step 2
.ReplaceDataTableRows(columnName, "[^0-9]", string.Empty, true) // at this point we just have rows with mixed values, remove any non-numeric characters from the rows
.ReplaceDataTableRows(columnName, "^0", "31", true) // now we just have numbers (yay), first I replaced all the values starting with '0' to start with '31' instead
.PrependValueToDataTableColumn(columnName, "^31", "31") // my data was luckily pretty clean at this point, all that remained were numbers in the format "612345678" and "97123456789", so I just had to prepend 31 to them
.RemoveDuplicatesFromDataTableColumn(columnName); // make sure we don't have any new duplicates
dataGrid.ItemsSource = resultDt.AsDataView(); // finally, update the datagrid with the fixed data!
}
}