From ff1f2ca27d76a8fe8c4ca240f958b4bbde93abe5 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 17:14:23 +0000 Subject: [PATCH 01/10] Allow regexp column names --- lib/ndr_import/table.rb | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb index 2efd5dd..cea5acc 100644 --- a/lib/ndr_import/table.rb +++ b/lib/ndr_import/table.rb @@ -66,6 +66,7 @@ def process_line(line, &block) return enum_for(:process_line, line) unless block if @row_index < header_lines + mutate_regexp_columns(line) consume_header_line(line, @columns) else transform_line(line, @row_index, &block) @@ -79,6 +80,15 @@ def process_line(line, &block) @notifier.try(:processed, @row_index) end + # Update 'column' values expressed as a regular expression + def mutate_regexp_columns(line) + @columns.each_with_index do |col, index| + next unless col['column'].is_a? Regexp + + @columns[index]['column'] = line[index] if @columns[index]['column'] =~ line[index] + end + end + # This method transforms an incoming line of data by applying each of the klass masked # mappings to the line and yielding the klass and fields for each mapped klass. 
def transform_line(line, index) @@ -227,7 +237,7 @@ def header_message_for(missing, unexpected) # returns the column names as we expect to receive them def column_names(column_mappings) - column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase } + column_mappings.map { |c| (c['column'] || c['standard_mapping']).try(:downcase) } end # If specified in the mapping, stop transforming data at a given index (column) From e37f95587f175e8b05713ab609e42054fc40914f Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 17:14:36 +0000 Subject: [PATCH 02/10] Map column name to field --- lib/ndr_import/mapper.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/ndr_import/mapper.rb b/lib/ndr_import/mapper.rb index a2be28e..8dc6eaf 100644 --- a/lib/ndr_import/mapper.rb +++ b/lib/ndr_import/mapper.rb @@ -32,6 +32,7 @@ module Strings VALIDATES = 'validates'.freeze ZIP_ORDER = 'zip_order'.freeze SPLIT_CHAR = 'split_char'.freeze + MAP_COLUMNAME_TO = 'map_columname_to'.freeze end private @@ -119,6 +120,10 @@ def mapped_line(line, line_mappings) # Store the raw column value rawtext[rawtext_column_name] = raw_value + if column_mapping['map_columname_to'].present? 
+ data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {} + data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [(column_mapping['column'])] + end next unless column_mapping.key?(Strings::MAPPINGS) From b5addc505c8d9122cb746df10c12d052260a6ada Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 18:25:45 +0000 Subject: [PATCH 03/10] Regexp column name testing --- test/table_test.rb | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/table_test.rb b/test/table_test.rb index fd32813..3000bc5 100644 --- a/test/table_test.rb +++ b/test/table_test.rb @@ -569,6 +569,44 @@ def test_wrong_header_names '["one", "three"] unexpected: ["fun", "tree"]', exception.message) end + test 'should mutate regexp column names' do + lines = [ + %w[1234 STRING_HEADING ABC123], + %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC] + ].each + + table = NdrImport::Table.new( + header_lines: 1, + footer_lines: 0, + klass: 'SomeTestKlass', + columns: regexp_column_names + ) + + expected_output = [ + ['SomeTestKlass', + { rawtext: { '1234' => 'NUMRIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMBERIC' } }, + 1] + ] + assert_equal expected_output, table.transform(lines).to_a + end + + test 'should report header errors is regexp column names do not match' do + lines = [ + %w[A1234Z STRING_HEADING ABC123], + %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC] + ].each + + table = NdrImport::Table.new( + header_lines: 1, + footer_lines: 0, + klass: 'SomeTestKlass', + columns: regexp_column_names + ) + + exception = assert_raises(RuntimeError) { table.transform(lines).to_a } + assert_equal 'Header is not valid! 
unexpected: ["a1234z"]', exception.message + end + private def simple_deserialized_table @@ -620,6 +658,12 @@ def column_level_klass_masked_mappings } end + def regexp_column_names + [{ 'column' => /\A\d+\z/ }, + { 'column' => 'string_heading' }, + { 'column' => /\A[A-Z]+\d{3}\z/i }] + end + def get_yaml_mapping_order(yaml_mapping) yaml_mapping.split("\n"). delete_if { |line| /-+/.match(line) }. From d0cbdf1f61d28d544be9abed4a02539c355cfbc5 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 18:39:09 +0000 Subject: [PATCH 04/10] Store column name tests --- lib/ndr_import/mapper.rb | 7 +++++-- test/mapper_test.rb | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/lib/ndr_import/mapper.rb b/lib/ndr_import/mapper.rb index 8dc6eaf..347c499 100644 --- a/lib/ndr_import/mapper.rb +++ b/lib/ndr_import/mapper.rb @@ -120,9 +120,12 @@ def mapped_line(line, line_mappings) # Store the raw column value rawtext[rawtext_column_name] = raw_value - if column_mapping['map_columname_to'].present? + + # If configured, store the column name in the given field + if column_mapping[Strings::MAP_COLUMNAME_TO].present? 
data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {} - data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [(column_mapping['column'])] + data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [column_mapping['column']] + rawtext[column_mapping[Strings::MAP_COLUMNAME_TO]] = column_mapping['column'] end next unless column_mapping.key?(Strings::MAPPINGS) diff --git a/test/mapper_test.rb b/test/mapper_test.rb index edfec1d..9fb1fcf 100644 --- a/test/mapper_test.rb +++ b/test/mapper_test.rb @@ -335,6 +335,16 @@ def setup - field: field_two YML + map_colum_name_to_field_mapping = YAML.safe_load <<-YML + - column: column_one + mappings: + - field: field_one + - column: abc123 + map_columname_to: 'columnname_field' + mappings: + - field: field_two + YML + test 'map should return a number' do assert_equal '1', TestMapper.new.mapped_value('A', map_mapping) end @@ -728,4 +738,15 @@ def setup TestMapper.new.mapped_line(['A'], invalid_decode_mapping) end end + + test 'should map column name to field' do + mapped_line = TestMapper.new.mapped_line(%w[one two], map_colum_name_to_field_mapping) + expected_mapped_line = { + 'field_one' => 'one', + 'columnname_field' => 'abc123', + 'field_two' => 'two', + rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' } + } + assert_equal expected_mapped_line, mapped_line + end end From 56fe1d1360aacf7c37778f76ad6fe1221080250a Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 20:02:29 +0000 Subject: [PATCH 05/10] Documentation --- docs/capturing-column-names.md | 37 +++++++++++++++++++++++++++ docs/regexp-column-names.md | 44 +++++++++++++++++++++++++++++++++ docs/yaml-mapping-user-guide.md | 4 ++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 docs/capturing-column-names.md create mode 100644 docs/regexp-column-names.md diff --git a/docs/capturing-column-names.md b/docs/capturing-column-names.md new file mode 100644 index 0000000..1f40667 --- /dev/null +++ 
b/docs/capturing-column-names.md @@ -0,0 +1,37 @@ +--- +layout: page +title: Capturing Column Names in Mapped Data +permalink: /capturing-column-names/ +--- + +Column names themselves may contain data that should be included in each record. For example VCF files have a column name that is a Lab Number and it should be included on all records. + +In order to store the column name in each record, include the `map_columname_to` key at the column level, with the value being the desired field and rawtext name. + + +Example mapping + +--- + - column: column_one + mappings: + - field: field_one + - column: abc123 + map_columname_to: 'columnname_field' + mappings: + - field: field_two + +Example data: + +``` +"column_one","abc123" +"one","two" +``` + +This would result in: + +``` +{ 'field_one' => 'one', + 'columnname_field' => 'abc123', + 'field_two' => 'two', + rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' } } +``` diff --git a/docs/regexp-column-names.md b/docs/regexp-column-names.md new file mode 100644 index 0000000..a9b8e9a --- /dev/null +++ b/docs/regexp-column-names.md @@ -0,0 +1,44 @@ +--- +layout: page +title: Regular Expression Column Names +permalink: /regexp-column-names/ +--- + +Column names may differ between files, for example a lab number might be used as a column name. In order to map the column, the column name can be a regular expression. + +If the regular expression matches the column name in the raw file, the data will be mapped and loaded as expected. + +If the regular expression does not match the column name, a column header error will be raised. 
+ +Example mapping + +--- + - column: /\A[A-Z]+\d{3}\z/i + mappings: + - field: regex_field + - column: two + mappings: + - field: two + +Example data: + +``` +"abc123","two" +"regex_value","string_value" +``` + +This would result in: + +``` +{ 'regex_field' => 'regex_value', 'two' => 'string_value', + rawtext: { 'abc123' => 'regex_value', 'two' => 'string_value' } } +``` + +However, the below data: + +``` +"1234abc","two" +"regex_value","string_value" +``` + +would result in a RuntimeError: 'Header is not valid! unexpected: ["1234abc"]' diff --git a/docs/yaml-mapping-user-guide.md b/docs/yaml-mapping-user-guide.md index ccf5d10..86b76ae 100644 --- a/docs/yaml-mapping-user-guide.md +++ b/docs/yaml-mapping-user-guide.md @@ -13,4 +13,6 @@ add_to_nav: true 6. [Non Tabular Mappings](non-tabular-mappings.md) 7. [Date Formats](date-formats.md) 8. [XML mappings](xml-mappings.md) -9. [Zipped Field Mapping](priority-field-mapping.md) \ No newline at end of file +9. [Zipped Field Mapping](zipped-field-mapping.md) +10. [Regular Expression Column Names](regexp-column-names.md) +11. 
[Capturing Column Names in Mapped Data](capturing-column-names.md) \ No newline at end of file From c63711338e7838efc8101c1bf71504567772a94d Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 20:03:38 +0000 Subject: [PATCH 06/10] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae8ca1..dcc5546 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## [Unreleased] ### Added * Column zipping functionality * +* Capturing Column name * +* Regular expression column names * ## 11.2.1 / 2024-11-18 ### Fixed From 877ab4037ed37a79945a43c03bb2e5eda540253b Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 20:17:44 +0000 Subject: [PATCH 07/10] Add a slightly convoluted test --- test/vcf/table_test.rb | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/test/vcf/table_test.rb b/test/vcf/table_test.rb index a8c32d8..6b0aac8 100644 --- a/test/vcf/table_test.rb +++ b/test/vcf/table_test.rb @@ -20,18 +20,28 @@ def setup test 'should transform avro lines' do table = NdrImport::Vcf::Table.new(klass: 'SomeTestKlass', columns: vcf_column_mapping) - expected_data = ['SomeTestKlass', { rawtext: { - '#chrom' => '1', - 'pos' => '26387783', - 'id' => '.', - 'ref' => 'G', - 'alt' => 'A', - 'qual' => '847.77', - 'filter' => 'PASS', - 'info' => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection', - 'format' => 'GT:AD:DP:GQ:PL:SAC', - 'sample1' => '0/1:52,32:84:99:876,0,1277:21,31,14,18' - } }, 1] + expected_data = ['SomeTestKlass', + { 'zipped_field' => + [%w[GT 0/1], + %w[AD 52,32], + %w[DP 84], + %w[GQ 99], + %w[PL 876,0,1277], + %w[SAC 21,31,14,18]], + 'lab_number' => 'Sample1', + rawtext: + { '#chrom' => '1', + 'pos' => '26387783', + 'id' => '.', + 'ref' => 'G', + 'alt' => 'A', + 'qual' => '847.77', + 'filter' => 'PASS', + 'info' => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection', + 'format' => 'GT:AD:DP:GQ:PL:SAC', + 'sample1' => 
'0/1:52,32:84:99:876,0,1277:21,31,14,18', + 'lab_number' => 'Sample1' } }, + 1] transformed_data = table.transform(@rows) assert_equal 6, transformed_data.count @@ -65,8 +75,8 @@ def vcf_column_mapping { 'column' => 'qual' }, { 'column' => 'filter' }, { 'column' => 'info' }, - { 'column' => 'format' }, - { 'column' => 'sample1' }] + { 'column' => 'format', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 1, 'split_char' => /[:;]/] }, + { 'column' => /sample\d+/i, 'map_columname_to' => 'lab_number', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 2] }] end def unexpected_columns_mapping From 15b7b17e1605d4c62a0318ae72c18805ef8d9bbc Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Thu, 28 Nov 2024 20:35:31 +0000 Subject: [PATCH 08/10] Mutate regex column names while iterating --- lib/ndr_import/table.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb index cea5acc..0eed1e4 100644 --- a/lib/ndr_import/table.rb +++ b/lib/ndr_import/table.rb @@ -82,10 +82,10 @@ def process_line(line, &block) # Update 'column' values expressed as a regular expression def mutate_regexp_columns(line) - @columns.each_with_index do |col, index| - next unless col['column'].is_a? Regexp + @columns.each_with_index do |column, index| + next unless column['column'].is_a? 
Regexp - @columns[index]['column'] = line[index] if @columns[index]['column'] =~ line[index] + column['column'] = line[index] if column['column'] =~ line[index] end end From 8754a17dddd2730c24748a3a7de7daf5649fd145 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Fri, 29 Nov 2024 09:45:28 +0000 Subject: [PATCH 09/10] Use `match?` instead of `=~` as a conditional --- lib/ndr_import/table.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb index 0eed1e4..27711d3 100644 --- a/lib/ndr_import/table.rb +++ b/lib/ndr_import/table.rb @@ -85,7 +85,7 @@ def mutate_regexp_columns(line) @columns.each_with_index do |column, index| next unless column['column'].is_a? Regexp - column['column'] = line[index] if column['column'] =~ line[index] + column['column'] = line[index] if line[index].match? column['column'] end end From ebfffe9ac1aac88127bb41c37d2435e54c2da4e0 Mon Sep 17 00:00:00 2001 From: ollietulloch Date: Tue, 3 Dec 2024 11:40:23 +0000 Subject: [PATCH 10/10] typos --- test/table_test.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/table_test.rb b/test/table_test.rb index 3000bc5..779fb43 100644 --- a/test/table_test.rb +++ b/test/table_test.rb @@ -572,7 +572,7 @@ def test_wrong_header_names test 'should mutate regexp column names' do lines = [ %w[1234 STRING_HEADING ABC123], - %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC] + %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC] ].each table = NdrImport::Table.new( @@ -584,7 +584,7 @@ def test_wrong_header_names expected_output = [ ['SomeTestKlass', - { rawtext: { '1234' => 'NUMRIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMBERIC' } }, + { rawtext: { '1234' => 'NUMERIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMERIC' } }, 1] ] assert_equal expected_output, table.transform(lines).to_a @@ -593,7 +593,7 @@ def test_wrong_header_names test 'should report header errors is regexp column names do not 
match' do lines = [ %w[A1234Z STRING_HEADING ABC123], - %w[NUMRIC_ONLY STRING_VALUE ALPHA_NUMBERIC] + %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC] ].each table = NdrImport::Table.new(