@genebe/gnomad-exomes-depth:0.0.1-4.1.0
Depth of the GnomAD Exomes
Description
gnomad-exomes-depth 0.0.1-4.1.0
Description is empty.
Instructions
// Input: gnomAD exomes coverage summary (tab-separated, bzip2-compressed).
// NOTE(review): the file name says v4.0 while the annotation version is 0.0.1-4.1.0 — confirm the intended release.
val file = "/tmp/gnomad.exomes.v4.0.coverage.summary.tsv.bz2"

// Read the TSV with a header row; "." is treated as a missing value.
// Trailing dots keep the chain pasteable line by line into spark-shell.
val depth = spark.read.
  format("csv").
  option("header", "true").
  option("inferSchema", true).
  option("delimiter", "\t").
  option("nullValue", ".").
  load(file)

// Split `locus` at the colon: the part before it (minus a leading "chr") becomes `chr`,
// the part after it becomes `pos`.
val chrFromLocus = regexp_replace(regexp_replace($"locus", ":.*", ""), "^chr", "")
// Cast explicitly: regexp_replace yields a string, and the downstream UDF takes an Int,
// so this avoids depending on Spark's implicit string-to-int coercion.
val posFromLocus = regexp_replace($"locus", "^.*:", "").cast("int")

// Keep only the three columns the conversion step needs.
val depthReady = depth.select(chrFromLocus.as("chr"), posFromLocus.as("pos"), $"median_approx")
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * Converts a position-keyed DataFrame to `_seq` / 0-based `_pos` coordinates and writes
 * it as zstd-compressed Parquet, partitioned by `_seq`.
 *
 * Rows are de-duplicated on (`_seq`, `_pos`); the `chr` and `pos` input columns are
 * dropped and every other column is carried through unchanged.
 *
 * @param df        input containing a `chr` column and a 1-based integer `pos` column
 * @param outputDir directory the Parquet dataset is written to
 */
def processDataFramePosition(df: DataFrame, outputDir: String): Unit = {
  // Maps (chromosome, 1-based position) to (sequence name, 0-based position).
  // A null/blank chromosome or a negative position yields the sentinel (null, -1).
  // `forall(_.isWhitespace)` is also true for "" and, unlike String.isBlank, works before JDK 11.
  def toSpdi(chrInput: String, posInput: Int): (String, Int) =
    if (chrInput == null || chrInput.forall(_.isWhitespace) || posInput < 0) {
      (null, -1)
    } else {
      val stripped = chrInput.stripPrefix("chr")
      val seq = if (stripped == "MT") "M" else stripped // Normalize for human genomes
      (seq, posInput - 1) // Convert to 0-based indexing
    }

  // Wrap the converter as a Spark UDF; the tuple result is exposed as a struct with fields _1 and _2.
  val toSpdiUdf = udf((chrInput: String, posInput: Int) => toSpdi(chrInput, posInput))

  // Compute the struct once, unpack it into `_seq` / `_pos`, then discard the helper and source columns.
  val finalDf = df
    .withColumn("spdi", toSpdiUdf(col("chr"), col("pos")))
    .withColumn("_seq", col("spdi").getField("_1"))
    .withColumn("_pos", col("spdi").getField("_2").cast("int"))
    .drop("spdi", "chr", "pos")
    .dropDuplicates("_seq", "_pos")

  // `col(...)` is used instead of `$"..."` so the function does not need `spark.implicits._` in scope.
  // Rows are repartitioned by `_seq` and sorted by (`_seq`, `_pos`) within each partition before writing.
  finalDf
    .repartition(col("_seq"))
    .sortWithinPartitions(col("_seq"), col("_pos"))
    .write
    .option("compression", "zstd")
    .partitionBy("_seq")
    .parquet(outputDir)
}
// Convert the prepared depth table and write the Parquet dataset to /tmp/depthExomes.
processDataFramePosition(df = depthReady, outputDir = "/tmp/depthExomes")
Then create the annotation from the Parquet output:
annotation create-from-parquet --input /tmp/depthExomes --owner @genebe --name gnomad-exomes-depth --version 0.0.1-4.1.0 --species homo_sapiens --genome GRCh38 --title "Depth of the GnomAD Exomes" --database-type POSITION
Meta Information
Access:
PUBLIC
Author:
@genebe
Pull Command:
java -jar genebe.jar annotation pull --id @genebe/gnomad-exomes-depth:0.0.1-4.1.0
more examples
Created:
20 Jan 2025, 08:27:08 UTC
Type:
POSITION
Genome:
GRCh38
Status:
ACTIVE