From ff1f2ca27d76a8fe8c4ca240f958b4bbde93abe5 Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 17:14:23 +0000
Subject: [PATCH 01/10] Allow regexp column names

---
 lib/ndr_import/table.rb | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb
index 2efd5dd..cea5acc 100644
--- a/lib/ndr_import/table.rb
+++ b/lib/ndr_import/table.rb
@@ -66,6 +66,7 @@ def process_line(line, &block)
       return enum_for(:process_line, line) unless block
 
       if @row_index < header_lines
+        mutate_regexp_columns(line)
         consume_header_line(line, @columns)
       else
         transform_line(line, @row_index, &block)
@@ -79,6 +80,15 @@ def process_line(line, &block)
       @notifier.try(:processed, @row_index)
     end
 
+    # Update 'column' values expressed as a regular expression
+    def mutate_regexp_columns(line)
+      @columns.each_with_index do |col, index|
+        next unless col['column'].is_a? Regexp
+
+        @columns[index]['column'] = line[index] if @columns[index]['column'] =~ line[index]
+      end
+    end
+
     # This method transforms an incoming line of data by applying each of the klass masked
     # mappings to the line and yielding the klass and fields for each mapped klass.
     def transform_line(line, index)
@@ -227,7 +237,7 @@ def header_message_for(missing, unexpected)
 
     # returns the column names as we expect to receive them
     def column_names(column_mappings)
-      column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
+      column_mappings.map { |c| (c['column'] || c['standard_mapping']).try(:downcase) }
     end
 
     # If specified in the mapping, stop transforming data at a given index (column)

From e37f95587f175e8b05713ab609e42054fc40914f Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 17:14:36 +0000
Subject: [PATCH 02/10] Map column name to field

---
 lib/ndr_import/mapper.rb | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/ndr_import/mapper.rb b/lib/ndr_import/mapper.rb
index a2be28e..8dc6eaf 100644
--- a/lib/ndr_import/mapper.rb
+++ b/lib/ndr_import/mapper.rb
@@ -32,6 +32,7 @@ module Strings
     VALIDATES        = 'validates'.freeze
     ZIP_ORDER        = 'zip_order'.freeze
     SPLIT_CHAR       = 'split_char'.freeze
+    MAP_COLUMNAME_TO = 'map_columname_to'.freeze
   end
 
   private
@@ -119,6 +120,10 @@ def mapped_line(line, line_mappings)
 
       # Store the raw column value
       rawtext[rawtext_column_name] = raw_value
+      if column_mapping['map_columname_to'].present?
+        data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {}
+        data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [(column_mapping['column'])]
+      end
 
       next unless column_mapping.key?(Strings::MAPPINGS)
 

From b5addc505c8d9122cb746df10c12d052260a6ada Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 18:25:45 +0000
Subject: [PATCH 03/10] Regexp column name testing

---
 test/table_test.rb | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/test/table_test.rb b/test/table_test.rb
index fd32813..3000bc5 100644
--- a/test/table_test.rb
+++ b/test/table_test.rb
@@ -569,6 +569,44 @@ def test_wrong_header_names
                  '["one", "three"] unexpected: ["fun", "tree"]', exception.message)
   end
 
+  test 'should mutate regexp column names' do
+    lines = [
+      %w[1234 STRING_HEADING ABC123],
+      %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC]
+    ].each
+
+    table = NdrImport::Table.new(
+      header_lines: 1,
+      footer_lines: 0,
+      klass: 'SomeTestKlass',
+      columns: regexp_column_names
+    )
+
+    expected_output = [
+      ['SomeTestKlass',
+       { rawtext: { '1234' => 'NUMRIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMBERIC' } },
+       1]
+    ]
+    assert_equal expected_output, table.transform(lines).to_a
+  end
+
+  test 'should report header errors is regexp column names do not match' do
+    lines = [
+      %w[A1234Z STRING_HEADING ABC123],
+      %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC]
+    ].each
+
+    table = NdrImport::Table.new(
+      header_lines: 1,
+      footer_lines: 0,
+      klass: 'SomeTestKlass',
+      columns: regexp_column_names
+    )
+
+    exception = assert_raises(RuntimeError) { table.transform(lines).to_a }
+    assert_equal 'Header is not valid! unexpected: ["a1234z"]', exception.message
+  end
+
   private
 
   def simple_deserialized_table
@@ -620,6 +658,12 @@ def column_level_klass_masked_mappings
     }
   end
 
+  def regexp_column_names
+    [{ 'column' => /\A\d+\z/ },
+     { 'column' => 'string_heading' },
+     { 'column' => /\A[A-Z]+\d{3}\z/i }]
+  end
+
   def get_yaml_mapping_order(yaml_mapping)
     yaml_mapping.split("\n").
       delete_if { |line| /-+/.match(line) }.

From d0cbdf1f61d28d544be9abed4a02539c355cfbc5 Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 18:39:09 +0000
Subject: [PATCH 04/10] Store column name tests

---
 lib/ndr_import/mapper.rb |  7 +++++--
 test/mapper_test.rb      | 21 +++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/lib/ndr_import/mapper.rb b/lib/ndr_import/mapper.rb
index 8dc6eaf..347c499 100644
--- a/lib/ndr_import/mapper.rb
+++ b/lib/ndr_import/mapper.rb
@@ -120,9 +120,12 @@ def mapped_line(line, line_mappings)
 
       # Store the raw column value
       rawtext[rawtext_column_name] = raw_value
-      if column_mapping['map_columname_to'].present?
+
+      # If configured, store the column name in the given field
+      if column_mapping[Strings::MAP_COLUMNAME_TO].present?
         data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {}
-        data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [(column_mapping['column'])]
+        data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [column_mapping['column']]
+        rawtext[column_mapping[Strings::MAP_COLUMNAME_TO]] = column_mapping['column']
       end
 
       next unless column_mapping.key?(Strings::MAPPINGS)
diff --git a/test/mapper_test.rb b/test/mapper_test.rb
index edfec1d..9fb1fcf 100644
--- a/test/mapper_test.rb
+++ b/test/mapper_test.rb
@@ -335,6 +335,16 @@ def setup
       - field: field_two
   YML
 
+  map_colum_name_to_field_mapping = YAML.safe_load <<-YML
+    - column: column_one
+      mappings:
+      - field: field_one
+    - column: abc123
+      map_columname_to: 'columnname_field'
+      mappings:
+      - field: field_two
+  YML
+
   test 'map should return a number' do
     assert_equal '1', TestMapper.new.mapped_value('A', map_mapping)
   end
@@ -728,4 +738,15 @@ def setup
       TestMapper.new.mapped_line(['A'], invalid_decode_mapping)
     end
   end
+
+  test 'should map column name to field' do
+    mapped_line = TestMapper.new.mapped_line(%w[one two], map_colum_name_to_field_mapping)
+    expected_mapped_line = {
+      'field_one' => 'one',
+      'columnname_field' => 'abc123',
+      'field_two' => 'two',
+      rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' }
+    }
+    assert_equal expected_mapped_line, mapped_line
+  end
 end

From 56fe1d1360aacf7c37778f76ad6fe1221080250a Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 20:02:29 +0000
Subject: [PATCH 05/10] Documentation

---
 docs/capturing-column-names.md  | 37 +++++++++++++++++++++++++++
 docs/regexp-column-names.md     | 44 +++++++++++++++++++++++++++++++++
 docs/yaml-mapping-user-guide.md |  4 ++-
 3 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 docs/capturing-column-names.md
 create mode 100644 docs/regexp-column-names.md

diff --git a/docs/capturing-column-names.md b/docs/capturing-column-names.md
new file mode 100644
index 0000000..1f40667
--- /dev/null
+++ b/docs/capturing-column-names.md
@@ -0,0 +1,37 @@
+---
+layout: page
+title: Capturing Column Names in Mapped Data
+permalink: /capturing-column-names/
+---
+
+Column names themselves may contain data that should be included in each record. For example, VCF files have a column name that is a Lab Number, and it should be included on all records.
+
+In order to store the column name in each record, include the `map_columname_to` key at the column level, with the value being the desired field and rawtext name.
+
+
+Example mapping
+
+---
+    - column: column_one
+      mappings:
+      - field: field_one
+    - column: abc123
+      map_columname_to: 'columnname_field'
+      mappings:
+      - field: field_two
+
+Example data:
+
+```
+"column_one","abc123"
+"one","two"
+```
+
+This would result in:
+
+```
+{ 'field_one' => 'one',
+  'columnname_field' => 'abc123',
+  'field_two' => 'two',
+  rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' } }
+```
diff --git a/docs/regexp-column-names.md b/docs/regexp-column-names.md
new file mode 100644
index 0000000..a9b8e9a
--- /dev/null
+++ b/docs/regexp-column-names.md
@@ -0,0 +1,44 @@
+---
+layout: page
+title: Regular Expression Column Names
+permalink: /regexp-column-names/
+---
+
+Column names may differ between files; for example, a lab number might be used as a column name. In order to map the column, the column name can be a regular expression.
+
+If the regular expression matches the column name in the raw file, the data will be mapped and loaded as expected.
+
+If the regular expression does not match the column name, a column header error will be raised.
+
+Example mapping
+
+---
+    - column: /\A[A-Z]+\d{3}\z/i
+      mappings:
+      - field: regex_field
+    - column: two
+      mappings:
+      - field: two
+
+Example data:
+
+```
+"abc123","two"
+"regex_value","string_value"
+```
+
+This would result in:
+
+```
+{ 'regex_field' => 'regex_value', 'two' => 'string_value',
+  rawtext: { 'abc123' => 'regex_value', 'two' => 'string_value' } }
+```
+
+However, the below data:
+
+```
+"1234abc","two"
+"regex_value","string_value"
+```
+
+would result in a RuntimeError: 'Header is not valid! unexpected: ["1234abc"]'
diff --git a/docs/yaml-mapping-user-guide.md b/docs/yaml-mapping-user-guide.md
index ccf5d10..86b76ae 100644
--- a/docs/yaml-mapping-user-guide.md
+++ b/docs/yaml-mapping-user-guide.md
@@ -13,4 +13,6 @@ add_to_nav: true
 6. [Non Tabular Mappings](non-tabular-mappings.md)
 7. [Date Formats](date-formats.md)
 8. [XML mappings](xml-mappings.md)
-9. [Zipped Field Mapping](priority-field-mapping.md)
\ No newline at end of file
+9. [Zipped Field Mapping](zipped-field-mapping.md)
+10. [Regular Expression Column Names](regexp-column-names.md)
+11. [Capturing Column Names in Mapped Data](capturing-column-names.md)
\ No newline at end of file

From c63711338e7838efc8101c1bf71504567772a94d Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 20:03:38 +0000
Subject: [PATCH 06/10] Changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ae8ca1..dcc5546 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,8 @@
 ## [Unreleased]
 ### Added
 * Column zipping functionality *
+* Capturing Column name *
+* Regular expression column names *
 
 ## 11.2.1 / 2024-11-18
 ### Fixed

From 877ab4037ed37a79945a43c03bb2e5eda540253b Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 20:17:44 +0000
Subject: [PATCH 07/10] Add a slightly convoluted test

---
 test/vcf/table_test.rb | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/test/vcf/table_test.rb b/test/vcf/table_test.rb
index a8c32d8..6b0aac8 100644
--- a/test/vcf/table_test.rb
+++ b/test/vcf/table_test.rb
@@ -20,18 +20,28 @@ def setup
     test 'should transform avro lines' do
       table = NdrImport::Vcf::Table.new(klass: 'SomeTestKlass', columns: vcf_column_mapping)
 
-      expected_data = ['SomeTestKlass', { rawtext: {
-        '#chrom'  => '1',
-        'pos'     => '26387783',
-        'id'      => '.',
-        'ref'     => 'G',
-        'alt'     => 'A',
-        'qual'    => '847.77',
-        'filter'  => 'PASS',
-        'info'    => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection',
-        'format'  => 'GT:AD:DP:GQ:PL:SAC',
-        'sample1' => '0/1:52,32:84:99:876,0,1277:21,31,14,18'
-      } }, 1]
+      expected_data = ['SomeTestKlass',
+                       { 'zipped_field' =>
+                         [%w[GT 0/1],
+                          %w[AD 52,32],
+                          %w[DP 84],
+                          %w[GQ 99],
+                          %w[PL 876,0,1277],
+                          %w[SAC 21,31,14,18]],
+                         'lab_number' => 'Sample1',
+                         rawtext:
+                         { '#chrom'     => '1',
+                           'pos'        => '26387783',
+                           'id'         => '.',
+                           'ref'        => 'G',
+                           'alt'        => 'A',
+                           'qual'       => '847.77',
+                           'filter'     => 'PASS',
+                           'info'       => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection',
+                           'format'     => 'GT:AD:DP:GQ:PL:SAC',
+                           'sample1'    => '0/1:52,32:84:99:876,0,1277:21,31,14,18',
+                           'lab_number' => 'Sample1' } },
+                       1]
 
       transformed_data = table.transform(@rows)
       assert_equal 6, transformed_data.count
@@ -65,8 +75,8 @@ def vcf_column_mapping
        { 'column' => 'qual' },
        { 'column' => 'filter' },
        { 'column' => 'info' },
-       { 'column' => 'format' },
-       { 'column' => 'sample1' }]
+       { 'column' => 'format', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 1, 'split_char' => /[:;]/] },
+       { 'column' => /sample\d+/i, 'map_columname_to' => 'lab_number', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 2] }]
     end
 
     def unexpected_columns_mapping

From 15b7b17e1605d4c62a0318ae72c18805ef8d9bbc Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Thu, 28 Nov 2024 20:35:31 +0000
Subject: [PATCH 08/10] Mutate regex column names while iterating

---
 lib/ndr_import/table.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb
index cea5acc..0eed1e4 100644
--- a/lib/ndr_import/table.rb
+++ b/lib/ndr_import/table.rb
@@ -82,10 +82,10 @@ def process_line(line, &block)
 
     # Update 'column' values expressed as a regular expression
     def mutate_regexp_columns(line)
-      @columns.each_with_index do |col, index|
-        next unless col['column'].is_a? Regexp
+      @columns.each_with_index do |column, index|
+        next unless column['column'].is_a? Regexp
 
-        @columns[index]['column'] = line[index] if @columns[index]['column'] =~ line[index]
+        column['column'] = line[index] if column['column'] =~ line[index]
       end
     end
 

From 8754a17dddd2730c24748a3a7de7daf5649fd145 Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Fri, 29 Nov 2024 09:45:28 +0000
Subject: [PATCH 09/10] Use `match?` instead of `=~` as a conditional

---
 lib/ndr_import/table.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb
index 0eed1e4..27711d3 100644
--- a/lib/ndr_import/table.rb
+++ b/lib/ndr_import/table.rb
@@ -85,7 +85,7 @@ def mutate_regexp_columns(line)
       @columns.each_with_index do |column, index|
         next unless column['column'].is_a? Regexp
 
-        column['column'] = line[index] if column['column'] =~ line[index]
+        column['column'] = line[index] if line[index].match? column['column']
       end
     end
 

From ebfffe9ac1aac88127bb41c37d2435e54c2da4e0 Mon Sep 17 00:00:00 2001
From: ollietulloch <ollie.tulloch@nhs.net>
Date: Tue, 3 Dec 2024 11:40:23 +0000
Subject: [PATCH 10/10] typos

---
 test/table_test.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/table_test.rb b/test/table_test.rb
index 3000bc5..779fb43 100644
--- a/test/table_test.rb
+++ b/test/table_test.rb
@@ -572,7 +572,7 @@ def test_wrong_header_names
   test 'should mutate regexp column names' do
     lines = [
       %w[1234 STRING_HEADING ABC123],
-      %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC]
+      %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC]
     ].each
 
     table = NdrImport::Table.new(
@@ -584,7 +584,7 @@ def test_wrong_header_names
 
     expected_output = [
       ['SomeTestKlass',
-       { rawtext: { '1234' => 'NUMRIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMBERIC' } },
+       { rawtext: { '1234' => 'NUMERIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMERIC' } },
        1]
     ]
     assert_equal expected_output, table.transform(lines).to_a
@@ -593,7 +593,7 @@ def test_wrong_header_names
   test 'should report header errors is regexp column names do not match' do
     lines = [
       %w[A1234Z STRING_HEADING ABC123],
-      %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC]
+      %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC]
     ].each
 
     table = NdrImport::Table.new(