@genebe/gnomad-exomes-depth:0.0.1-4.1.0

Depth of the GnomAD Exomes

Description

gnomad-exomes-depth 0.0.1-4.1.0

Description is empty.

Instructions

// Load the coverage summary: tab-separated with a header row, "." marks a missing value.
val file = "/tmp/gnomad.exomes.v4.0.coverage.summary.tsv.bz2"
val depth = {
  spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", true)
    .option("delimiter", "\t")
    .option("nullValue", ".")
    .load(file)
}
// Split `locus` into `chr` and `pos` and keep only the median depth.
// Assumes `locus` is formatted as `<contig>:<position>`; a leading "chr" is removed.
val depthReady = {
  val chrCol = regexp_replace(regexp_replace(col("locus"), ":.*", ""), "^chr", "")
  // Cast explicitly: the downstream UDF takes an Int, so do not rely on implicit
  // string-to-int coercion of the regexp_replace result.
  val posCol = regexp_replace(col("locus"), "^.*:", "").cast("int")
  depth.select(chrCol.as("chr"), posCol.as("pos"), col("median_approx"))
}

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Re-keys a position-based DataFrame to `_seq` / `_pos` (0-based) and writes it as
 * zstd-compressed Parquet partitioned by `_seq`.
 *
 * The `chr` and `pos` columns are replaced by `_seq` and `_pos`; rows are de-duplicated on
 * that pair, and each output partition is sorted by (`_seq`, `_pos`). Rows whose key maps to
 * the `(null, -1)` sentinel are not filtered out here.
 *
 * @param df        input with a `chr` column and a 1-based `pos` column usable as an Int
 * @param outputDir directory the Parquet dataset is written to
 */
def processDataFramePosition(df: DataFrame, outputDir: String): Unit = {
  // Maps (chromosome, 1-based position) to (sequence name, 0-based position).
  // A null or blank chromosome, or a negative position, yields the sentinel (null, -1).
  // `forall(_.isWhitespace)` is used instead of `String.isBlank`, which needs Java 11+.
  // NOTE(review): a position of 0 passes the guard and becomes -1 while keeping its
  // sequence name — confirm whether the guard should be `posInput < 1`.
  def toSpdi(chrInput: String, posInput: Int): (String, Int) =
    if (chrInput == null || chrInput.forall(_.isWhitespace) || posInput < 0) {
      (null, -1)
    } else {
      val stripped = chrInput.stripPrefix("chr")
      val seq = if (stripped == "MT") "M" else stripped // Normalize for human genomes
      (seq, posInput - 1) // Convert to 0-based indexing
    }

  // The tuple result becomes a struct column with fields `_1` and `_2`.
  val toSpdiUdf = udf((chrInput: String, posInput: Int) => toSpdi(chrInput, posInput))

  val finalDf = df
    .withColumn("spdi", toSpdiUdf(col("chr"), col("pos")))
    .withColumn("_seq", col("spdi").getField("_1"))
    .withColumn("_pos", col("spdi").getField("_2").cast("int"))
    .drop("spdi", "chr", "pos")
    .dropDuplicates("_seq", "_pos")

  // `col(...)` rather than `$"..."` so the function does not depend on
  // `spark.implicits._` being in scope at the definition site.
  finalDf
    .repartition(col("_seq"))
    .sortWithinPartitions(col("_seq"), col("_pos"))
    .write
    .option("compression", "zstd")
    .partitionBy("_seq")
    .parquet(outputDir)
}

// Convert the prepared depth table and write the partitioned Parquet output.
processDataFramePosition(df = depthReady, outputDir = "/tmp/depthExomes")

Then create the annotation from the Parquet output:

annotation create-from-parquet --input /tmp/depthExomes --owner @genebe --name gnomad-exomes-depth --version 0.0.1-4.1.0 --species homo_sapiens --genome GRCh38 --title "Depth of the GnomAD Exomes" --database-type POSITION

Meta Information

Access:

PUBLIC

Author:

@genebe

Pull Command:

java -jar genebe.jar annotation pull --id @genebe/gnomad-exomes-depth:0.0.1-4.1.0

more examples

Created:

20 Jan 2025, 08:27:08 UTC

Type:

POSITION

Genome:

GRCh38

Status:

ACTIVE

License:

NOT_SPECIFIED

Version: