@genebe/dbsnp:0.0.1-156

The Single Nucleotide Polymorphism Database

Description

dbsnp 0.0.1-156

Database mapping chromosomal positions in GRCh38 to dbSNP rs identifiers, version 156.

Instructions

  1. Download the current dbSNP release for GRCh38 (hg38) in VCF format from https://ftp.ncbi.nlm.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz
  2. Normalize:
#!/bin/bash

# reference genome FASTA (chr-prefixed hg38), used by bcftools norm
GENOME=/data/hg38.fa

for i in GCF_000001405.40.gz; do
        # rename RefSeq accessions to plain chromosome names
        # (NC_000001.11 -> 1, ..., NC_000023 -> X, NC_000024 -> Y, NC_012920 -> M),
        # drop unplaced/scaffold contigs (NW_*, NT_*), add a temporary chr prefix
        # so the records match the FASTA, split multiallelic sites and left-normalize,
        # then strip the prefix again, bgzip-compress and index
        pigz -cd $i \
        |  sed -e 's/^NC_0000\([0-9][0-9]\)\.[0-9]*/\1/' | sed -e 's/^0//' -e 's/^23/X/' -e 's/^24/Y/' -e 's/NC_012920\.[0-9]*/M/' | grep -v -e '^NW_' -e '^NT_'  \
        | sed -e 's!^\([0-9XYM][0-9]*\)\t!chr\1\t!' \
        | bcftools norm --multiallelics -any --check-ref s -f $GENOME - \
        | sed -e 's!^chr!!' \
        | pbgzip -c /dev/stdin > ${i%.gz}.normalized.vcf.gz
        tabix ${i%.gz}.normalized.vcf.gz
done
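Optionally, sanity-check the normalized file before conversion. A minimal spark-shell sketch (the path matches the output above; the bgzip output is gzip-compatible, so Spark can read it as plain text, albeit in a single partition); the distinct CHROM values should be 1..22, X, Y and M:

import org.apache.spark.sql.functions.substring_index
val vcf = spark.read.text("/data/GCF_000001405.40.normalized.vcf.gz")
vcf.filter(!$"value".startsWith("#")).select(substring_index($"value", "\t", 1).as("chrom")).distinct.show(30)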
  3. Convert to Parquet format, using https://github.com/bigdatagenomics/adam for fast VCF processing in Spark
// ~/software/adam.24/bin/adam-shell --driver-memory 20G

sc.hadoopConfiguration.setBoolean("org.bdgenomics.adam.converters.VariantContextConverter.NEST_ANN_IN_GENOTYPES", true)
import org.bdgenomics.adam.ds.ADAMContext
val ac = new ADAMContext(sc)
val file = "/data/GCF_000001405.40.normalized.vcf.gz"

val d = ac.loadVcf(file).toVariants.toDF
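// the ADAM Variant schema exposes referenceName, start (0-based), referenceAllele,
// alternateAllele, plus an annotation struct carrying the VCF INFO attributes (e.g. RS)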
import org.apache.spark.sql.types.IntegerType

// previous version with COMMON attribute
// d.withColumn("COMMON", $"annotation.attributes.COMMON".cast(IntegerType)).withColumn("RS", concat(lit("rs"), $"annotation.attributes.RS")).withColumn("chr", regexp_replace($"referenceName", "^chr", "")).withColumn("pos", $"start"+1).withColumnRenamed("referenceAllele", "ref").withColumnRenamed("alternateAllele","alt").select("chr", "pos","RS","COMMON","ref","alt").dropDuplicates("pos","chr","alt","ref").repartition($"chr").sortWithinPartitions($"chr", $"pos").write.partitionBy("chr").parquet("/tmp/dbsnp")
d.withColumn("RS", concat(lit("rs"), $"annotation.attributes.RS")).withColumn("chr", regexp_replace($"referenceName", "^chr", "")).withColumn("pos", $"start"+1).withColumnRenamed("referenceAllele", "ref").withColumnRenamed("alternateAllele","alt").select("chr", "pos","RS","ref","alt").dropDuplicates("pos","chr","alt","ref").na.fill("").repartition($"chr").sortWithinPartitions($"chr", $"pos").write.partitionBy("chr").parquet("/tmp/dbsnp")



// convert to partitioned db in genebe format
val df = spark.read.parquet("/tmp/dbsnp")
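// columns as written in the previous step: chr, pos (1-based), RS, ref, alt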

// define a function converting VCF-style (chr, pos, ref, alt) to SPDI-like (seq, 0-based pos, deletion length, inserted sequence)
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row}

def toSpdi(chrInput: String, posInput: Int, refInput: String, altInput: String): (String, Int, Int, String) = {
    if (chrInput == null || chrInput.isBlank || posInput < 0) {
        return (null, -1, -1, null)
    }

    var seq = chrInput.stripPrefix("chr")
    if (seq == "MT") seq = "M" // Normalize for human genomes

    var ref = refInput
    var alt = altInput
    val vcfPos = posInput

    // Remove leading common bases
    var leadingCommon = 0
    while (leadingCommon < ref.length && leadingCommon < alt.length &&
           ref.charAt(leadingCommon) == alt.charAt(leadingCommon)) {
        leadingCommon += 1
    }
    ref = ref.substring(leadingCommon)
    alt = alt.substring(leadingCommon)
    val pos = vcfPos - 1 + leadingCommon // Convert to 0-based

    // Remove trailing common bases
    var trailingCommon = 0
    while (ref.length > trailingCommon && alt.length > trailingCommon &&
           ref.charAt(ref.length - 1 - trailingCommon) == alt.charAt(alt.length - 1 - trailingCommon)) {
        trailingCommon += 1
    }
    ref = ref.substring(0, ref.length - trailingCommon)
    alt = alt.substring(0, alt.length - trailingCommon)

    // Compute del as the number of bases in the remaining ref
    val del = ref.length
    val ins = alt

    (seq, pos, del, ins)
}
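// a few worked examples (hypothetical variants, not taken from dbSNP) showing the
// contraction: positions become 0-based and shared context bases are trimmed
assert(toSpdi("chr1", 100, "A", "G") == ("1", 99, 1, "G"))   // SNV: del=1, ins="G"
assert(toSpdi("chr1", 100, "ACT", "A") == ("1", 100, 2, "")) // deletion: anchor base trimmed, del=2
assert(toSpdi("MT", 5, "T", "C") == ("M", 4, 1, "C"))        // MT is normalized to M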

// wrap toSpdi as a Spark UDF; the returned tuple becomes a struct with fields _1.._4
val toSpdiUdf = udf(toSpdi _)

// apply the UDF to the input columns
val spdiUdf = toSpdiUdf(col("chr"), col("pos"), col("ref"), col("alt"))

// unpack the struct into _seq, _pos, _del, _ins on the dataframe loaded above (df)
val dfWithSpdi = { df
  .withColumn("spdi", spdiUdf)
  .withColumn("_seq", col("spdi").getField("_1"))
  .withColumn("_pos", col("spdi").getField("_2").cast("int"))
  .withColumn("_del", col("spdi").getField("_3").cast("int"))
  .withColumn("_ins", col("spdi").getField("_4"))
  .drop("spdi") }

// drop the original columns; the same information now lives in _seq, _pos, _del, _ins
val finalDf = dfWithSpdi.drop("chr", "pos", "ref", "alt").dropDuplicates("_seq","_pos","_del","_ins")

// partition by _seq, order by _seq, _pos, _del, _ins, write zstd-compressed parquet
{ finalDf
  .repartition($"_seq")
  .sortWithinPartitions($"_seq", $"_pos", $"_del", $"_ins")
  .write.option("compression", "zstd")
  .partitionBy("_seq")
  .parquet("/tmp/dbsnp-ready") }
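// optional: inspect the final layout before importing; partition directories should be
// _seq=1 .. _seq=22, _seq=X, _seq=Y, _seq=M
val ready = spark.read.parquet("/tmp/dbsnp-ready")
ready.printSchema
ready.filter($"_seq" === "1").orderBy($"_pos").show(5, false)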
  4. Import into the local GeneBe repository
java -jar GeneBeClient.jar annotation create-from-parquet --input /tmp/dbsnp-ready --owner @genebe --name dbsnp --version 0.0.1-156 --species homo_sapiens --genome GRCh38 --title "The Single Nucleotide Polymorphism Database"
  5. Push the annotation
java -jar GeneBeClient.jar annotation push --id @genebe/dbsnp:0.0.1-156 --public true

Meta Information

Access:

PUBLIC

Author:

@genebe

Pull Command:

java -jar genebe.jar annotation pull --id @genebe/dbsnp:0.0.1-156

Created:

17 Jan 2025, 12:21:21 UTC

Type:

VARIANT

Genome:

GRCh38

Status:

ACTIVE

License:

NOT_SPECIFIED

Version:

0.0.1-156