Skip to content

readVcf doesn't handle quoted = in hapmap_exome_chr22.vcf.gz correctly #80

@LTLA

Description

@LTLA

The header line for GATKCommandLine causes problems:

##GATKCommandLine=<ID=ApplyRecalibration,Version=3.3-0-g37228af,Date="Thu May 07 02:55:12 EDT 2015",Epoch=1430981712637,CommandLineOptions="analysis_type=ApplyRecalibration input_file=[] showFullBamList=false read_buffer_size=null phone_home
=NO_ET gatk_key=/isilon/sequencing/CIDRSeqSuiteSoftware/gatk/GATK_2/lee.watkins_jhmi.edu.key tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/isilo
n/sequencing/GATK_resource_bundle/1.5/b37/human_g1k_v37_decoy.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=
1000 baq=OFF baqGapOpenPenalty=40.0 refactor_NDN_cigar_string=false fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quanti
ze_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=nu
ll disable_auto_index_creation_and_locking_when_reading_rods=true no_cmdline_in_header=false sites_only=false never_trim_vcf_format_field=false bcf=false bam_compression=null simplifyBAM=false disable_bam_indexing=false generate_md5=false nu
m_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=
false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false input=[(RodBinding name=input source=/isilon/sequencing/Seq_Proj//Amos_LungCa_SeqWholeExome
Plus_040413_1/MULTI_SAMPLE/Amos_LungCa_SeqWholeExomePlus_040413_1.GATK-3.3-0.2050samples.raw.HC.INDEL.vcf)] recal_file=(RodBinding name=recal_file source=/isilon/sequencing/Seq_Proj//Amos_LungCa_SeqWholeExomePlus_040413_1/MULTI_SAMPLE/Amos_L
ungCa_SeqWholeExomePlus_040413_1.GATK-3.3-0.2050samples.HC.INDEL.recal) tranches_file=/isilon/sequencing/Seq_Proj/Amos_LungCa_SeqWholeExomePlus_040413_1/MULTI_SAMPLE/Amos_LungCa_SeqWholeExomePlus_040413_1.GATK-3.3-0.2050samples.HC.INDEL.tran
ches out=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub ts_filter_level=99.0 lodCutoff=null ignore_filter=null ignore_all_filters=false excludeFiltered=false mode=INDEL filter_reads_with_N_cigar=false filter_mismatching_bas
e_and_quals=false filter_bases_not_stored=false">

Calling readVcf gives me:

fl <- system.file("extdata", "hapmap_exome_chr22.vcf.gz", package="VariantAnnotation")
library(VariantAnnotation)
first <- readVcf(fl)
meta(header(first))$GATKCommandLine$CommandLineOptions
## [1] "\"analysis_type"

Oops, look like it just cut off after the = inside the quote. This causes no end of problems during a write round-trip:

writeVcf(first, out)
roundtrip <- readVcf(out, row.names=FALSE)
## [W::bcf_hdr_parse_line] Incomplete header line, trying to proceed anyway:
## 	[##GATKCommandLine=<ID=ApplyRecalibration,Version=3.3-0-g37228af,Date="Thu May 07 02:55:12 EDT 2015",Epoch=1430981712637,CommandLineOptions="analysis_type>
Session information
R Under development (unstable) (2023-11-29 r85646)
Platform: aarch64-apple-darwin22.5.0
Running under: macOS Ventura 13.6.1

Matrix products: default
BLAS:   /Users/luna/Software/R/trunk/lib/libRblas.dylib
LAPACK: /Users/luna/Software/R/trunk/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods
[8] base

other attached packages:
 [1] VariantAnnotation_1.49.2    Rsamtools_2.19.2
 [3] Biostrings_2.71.1           XVector_0.43.0
 [5] SummarizedExperiment_1.33.1 Biobase_2.63.0
 [7] GenomicRanges_1.55.1        GenomeInfoDb_1.39.5
 [9] IRanges_2.37.0              S4Vectors_0.41.3
[11] MatrixGenerics_1.15.0       matrixStats_1.2.0
[13] BiocGenerics_0.49.1

loaded via a namespace (and not attached):
 [1] KEGGREST_1.43.0          rjson_0.2.21             lattice_0.22-5
 [4] vctrs_0.6.5              tools_4.4.0              bitops_1.0-7
 [7] generics_0.1.3           curl_5.2.0               parallel_4.4.0
[10] tibble_3.2.1             fansi_1.0.6              AnnotationDbi_1.65.2
[13] RSQLite_2.3.4            blob_1.2.4               pkgconfig_2.0.3
[16] Matrix_1.6-4             BSgenome_1.71.1          dbplyr_2.4.0
[19] lifecycle_1.0.4          GenomeInfoDbData_1.2.11  compiler_4.4.0
[22] stringr_1.5.1            progress_1.2.3           codetools_0.2-19
[25] yaml_2.3.8               RCurl_1.98-1.13          pillar_1.9.0
[28] crayon_1.5.2             BiocParallel_1.37.0      DelayedArray_0.29.0
[31] cachem_1.0.8             abind_1.4-5              tidyselect_1.2.0
[34] digest_0.6.33            stringi_1.8.3            restfulr_0.0.15
[37] dplyr_1.1.4              biomaRt_2.59.0           fastmap_1.1.1
[40] grid_4.4.0               cli_3.6.2                SparseArray_1.3.2
[43] magrittr_2.0.3           S4Arrays_1.3.1           GenomicFeatures_1.55.1
[46] XML_3.99-0.16            utf8_1.2.4               rappdirs_0.3.3
[49] filelock_1.0.3           prettyunits_1.2.0        bit64_4.0.5
[52] httr_1.4.7               bit_4.0.5                png_0.1-8
[55] hms_1.1.3                memoise_2.0.1            BiocIO_1.13.0
[58] BiocFileCache_2.11.1     rtracklayer_1.63.0       rlang_1.1.2
[61] glue_1.6.2               DBI_1.2.0                xml2_1.3.6
[64] R6_2.5.1                 GenomicAlignments_1.39.0 zlibbioc_1.49.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions