diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae8ca1..dcc5546 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## [Unreleased] ### Added * Column zipping functionality * +* Capturing Column name * +* Regular expression column names * ## 11.2.1 / 2024-11-18 ### Fixed diff --git a/docs/capturing-column-names.md b/docs/capturing-column-names.md new file mode 100644 index 0000000..1f40667 --- /dev/null +++ b/docs/capturing-column-names.md @@ -0,0 +1,37 @@ +--- +layout: page +title: Capturing Column Names in Mapped Data +permalink: /capturing-column-names/ +--- + +Column names themselves may contain data that should be included in each record. For example, VCF files have a column name that is a Lab Number and it should be included on all records. + +In order to store the column name in each record, include the `map_columname_to` key at the column level, with the value being the desired field and rawtext name. + + +Example mapping + +--- + - column: column_one + mappings: + - field: field_one + - column: abc123 + map_columname_to: 'columnname_field' + mappings: + - field: field_two + +Example data: + +``` +"column_one","abc123" +"one","two" +``` + +This would result in: + +``` +{ 'field_one' => 'one', + 'columnname_field' => 'abc123', + 'field_two' => 'two', + rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' } } +``` diff --git a/docs/regexp-column-names.md b/docs/regexp-column-names.md new file mode 100644 index 0000000..a9b8e9a --- /dev/null +++ b/docs/regexp-column-names.md @@ -0,0 +1,44 @@ +--- +layout: page +title: Regular Expression Column Names +permalink: /regexp-column-names/ +--- + +Column names may differ between files; for example, a lab number might be used as a column name. In order to map the column, the column name can be a regular expression. + +If the regular expression matches the column name in the raw file, the data will be mapped and loaded as expected. 
+ +If the regular expression does not match the column name, a column header error will be raised. + +Example mapping + +--- + - column: /\A[A-Z]+\d{3}\z/i + mappings: + - field: regex_field + - column: two + mappings: + - field: two + +Example data: + +``` +"abc123","two" +"regex_value","string_value" +``` + +This would result in: + +``` +{ 'regex_field' => 'regex_value', 'two' => 'string_value', + rawtext: { 'abc123' => 'regex_value', 'two' => 'string_value' } } +``` + +However, the below data: + +``` +"1234abc","two" +"regex_value","string_value" +``` + +would result in a RuntimeError: 'Header is not valid! unexpected: ["1234abc"]' diff --git a/docs/yaml-mapping-user-guide.md b/docs/yaml-mapping-user-guide.md index ccf5d10..86b76ae 100644 --- a/docs/yaml-mapping-user-guide.md +++ b/docs/yaml-mapping-user-guide.md @@ -13,4 +13,6 @@ add_to_nav: true 6. [Non Tabular Mappings](non-tabular-mappings.md) 7. [Date Formats](date-formats.md) 8. [XML mappings](xml-mappings.md) -9. [Zipped Field Mapping](priority-field-mapping.md) \ No newline at end of file +9. [Zipped Field Mapping](zipped-field-mapping.md) +10. [Regular Expression Column Names](regexp-column-names.md) +11. [Capturing Column Names in Mapped Data](capturing-column-names.md) \ No newline at end of file diff --git a/lib/ndr_import/mapper.rb b/lib/ndr_import/mapper.rb index a2be28e..347c499 100644 --- a/lib/ndr_import/mapper.rb +++ b/lib/ndr_import/mapper.rb @@ -32,6 +32,7 @@ module Strings VALIDATES = 'validates'.freeze ZIP_ORDER = 'zip_order'.freeze SPLIT_CHAR = 'split_char'.freeze + MAP_COLUMNAME_TO = 'map_columname_to'.freeze end private @@ -120,6 +121,13 @@ def mapped_line(line, line_mappings) # Store the raw column value rawtext[rawtext_column_name] = raw_value + # If configured, store the column name in the given field + if column_mapping[Strings::MAP_COLUMNAME_TO].present? 
+ data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {} + data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [column_mapping['column']] + rawtext[column_mapping[Strings::MAP_COLUMNAME_TO]] = column_mapping['column'] + end + next unless column_mapping.key?(Strings::MAPPINGS) column_mapping[Strings::MAPPINGS].each do |field_mapping| diff --git a/lib/ndr_import/table.rb b/lib/ndr_import/table.rb index 2efd5dd..27711d3 100644 --- a/lib/ndr_import/table.rb +++ b/lib/ndr_import/table.rb @@ -66,6 +66,7 @@ def process_line(line, &block) return enum_for(:process_line, line) unless block if @row_index < header_lines + mutate_regexp_columns(line) consume_header_line(line, @columns) else transform_line(line, @row_index, &block) @@ -79,6 +80,15 @@ def process_line(line, &block) @notifier.try(:processed, @row_index) end + # Replace Regexp 'column' values with the matching header cell (a missing cell never matches) + def mutate_regexp_columns(line) + @columns.each_with_index do |column, index| + next unless column['column'].is_a? Regexp + + column['column'] = line[index] if column['column'].match?(line[index]) + end + end + # This method transforms an incoming line of data by applying each of the klass masked # mappings to the line and yielding the klass and fields for each mapped klass. 
def transform_line(line, index) @@ -227,7 +237,7 @@ def header_message_for(missing, unexpected) # returns the column names as we expect to receive them def column_names(column_mappings) - column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase } + column_mappings.map { |c| (c['column'] || c['standard_mapping']).try(:downcase) } end # If specified in the mapping, stop transforming data at a given index (column) diff --git a/test/mapper_test.rb b/test/mapper_test.rb index edfec1d..9fb1fcf 100644 --- a/test/mapper_test.rb +++ b/test/mapper_test.rb @@ -335,6 +335,16 @@ def setup - field: field_two YML + map_colum_name_to_field_mapping = YAML.safe_load <<-YML + - column: column_one + mappings: + - field: field_one + - column: abc123 + map_columname_to: 'columnname_field' + mappings: + - field: field_two + YML + test 'map should return a number' do assert_equal '1', TestMapper.new.mapped_value('A', map_mapping) end @@ -728,4 +738,15 @@ def setup TestMapper.new.mapped_line(['A'], invalid_decode_mapping) end end + + test 'should map column name to field' do + mapped_line = TestMapper.new.mapped_line(%w[one two], map_colum_name_to_field_mapping) + expected_mapped_line = { + 'field_one' => 'one', + 'columnname_field' => 'abc123', + 'field_two' => 'two', + rawtext: { 'column_one' => 'one', 'abc123' => 'two', 'columnname_field' => 'abc123' } + } + assert_equal expected_mapped_line, mapped_line + end end diff --git a/test/table_test.rb b/test/table_test.rb index fd32813..779fb43 100644 --- a/test/table_test.rb +++ b/test/table_test.rb @@ -569,6 +569,44 @@ def test_wrong_header_names '["one", "three"] unexpected: ["fun", "tree"]', exception.message) end + test 'should mutate regexp column names' do + lines = [ + %w[1234 STRING_HEADING ABC123], + %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC] + ].each + + table = NdrImport::Table.new( + header_lines: 1, + footer_lines: 0, + klass: 'SomeTestKlass', + columns: regexp_column_names + ) + + expected_output = [ + 
+ ['SomeTestKlass', + { rawtext: { '1234' => 'NUMERIC_ONLY', 'string_heading' => 'STRING_VALUE', 'abc123' => 'ALPHA_NUMERIC' } }, + 1] + ] + assert_equal expected_output, table.transform(lines).to_a + end + + test 'should report header errors if regexp column names do not match' do + lines = [ + %w[A1234Z STRING_HEADING ABC123], + %w[NUMERIC_ONLY STRING_VALUE ALPHA_NUMERIC] + ].each + + table = NdrImport::Table.new( + header_lines: 1, + footer_lines: 0, + klass: 'SomeTestKlass', + columns: regexp_column_names + ) + + exception = assert_raises(RuntimeError) { table.transform(lines).to_a } + assert_equal 'Header is not valid! unexpected: ["a1234z"]', exception.message + end + private def simple_deserialized_table @@ -620,6 +658,12 @@ def column_level_klass_masked_mappings } end + def regexp_column_names + [{ 'column' => /\A\d+\z/ }, + { 'column' => 'string_heading' }, + { 'column' => /\A[A-Z]+\d{3}\z/i }] + end + def get_yaml_mapping_order(yaml_mapping) yaml_mapping.split("\n"). delete_if { |line| /-+/.match(line) }. 
diff --git a/test/vcf/table_test.rb b/test/vcf/table_test.rb index a8c32d8..6b0aac8 100644 --- a/test/vcf/table_test.rb +++ b/test/vcf/table_test.rb @@ -20,18 +20,28 @@ def setup test 'should transform avro lines' do table = NdrImport::Vcf::Table.new(klass: 'SomeTestKlass', columns: vcf_column_mapping) - expected_data = ['SomeTestKlass', { rawtext: { - '#chrom' => '1', - 'pos' => '26387783', - 'id' => '.', - 'ref' => 'G', - 'alt' => 'A', - 'qual' => '847.77', - 'filter' => 'PASS', - 'info' => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection', - 'format' => 'GT:AD:DP:GQ:PL:SAC', - 'sample1' => '0/1:52,32:84:99:876,0,1277:21,31,14,18' - } }, 1] + expected_data = ['SomeTestKlass', + { 'zipped_field' => + [%w[GT 0/1], + %w[AD 52,32], + %w[DP 84], + %w[GQ 99], + %w[PL 876,0,1277], + %w[SAC 21,31,14,18]], + 'lab_number' => 'Sample1', + rawtext: + { '#chrom' => '1', + 'pos' => '26387783', + 'id' => '.', + 'ref' => 'G', + 'alt' => 'A', + 'qual' => '847.77', + 'filter' => 'PASS', + 'info' => 'AC=1;AF=0.500;AN=2;DP=85;set=Intersection', + 'format' => 'GT:AD:DP:GQ:PL:SAC', + 'sample1' => '0/1:52,32:84:99:876,0,1277:21,31,14,18', + 'lab_number' => 'Sample1' } }, + 1] transformed_data = table.transform(@rows) assert_equal 6, transformed_data.count @@ -65,8 +75,8 @@ def vcf_column_mapping { 'column' => 'qual' }, { 'column' => 'filter' }, { 'column' => 'info' }, - { 'column' => 'format' }, - { 'column' => 'sample1' }] + { 'column' => 'format', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 1, 'split_char' => /[:;]/] }, + { 'column' => /sample\d+/i, 'map_columname_to' => 'lab_number', 'mappings' => ['field' => 'zipped_field', 'zip_order' => 2] }] end def unexpected_columns_mapping