Page MenuHomec4science

createTables.go
No OneTemporary

File Metadata

Created
Sat, May 18, 18:44

createTables.go

package main
import (
"log"
"os"
"bufio"
"strings"
"encoding/json"
"io/ioutil"
//"gopkg.in/dedis/crypto.v0/abstract"
)
// NOTE: the name of the created .csv file is the same name that should be given to the postgres tables.
// ----------------------
// SET THESE BEFORE RUNNING THE MAIN
// set the path of the SHRINE_ONT_GENOMIC_ANNOTATIONS.csv file
const fromFile = "/home/niccolo/Downloads/new_tables/SHRINE_ONT_GENOMIC_ANNOTATIONS.csv"
// set the path in which we will store the new tables and files
const topath = "/home/niccolo/Downloads/new_tables/tables/"
// ----------------------
// make sure you have the names of the following needed fields
//const chromosome string = "Chromosome"
//const start_pos string = "Start_Position"
//const ref_allele string = "Reference_Allele"
//const tum_allele1 string = "Tumor_Seq_Allele1"
//const tum_allele2 string = "Tumor_Seq_Allele2"
//const end_position string = "End_Position"
//const hugo_symbol string = "Hugo_Symbol"
const chromosome string = "Chr"
const start_pos string = "Start Position"
const ref_allele string = "Reference Allele"
const tum_allele1 string = "Tumor Seq Allele1"
const tum_allele2 string = "Tumor Seq Allele2"
const end_position string = "End Position"
const hugo_symbol string = "Hugo Gene Symbol"
// ----------------------
// to request variant ids with popups
const toOntGenomicNew = "genomic_annotations.csv"
// to have autocompletion in gene name popup
const toGenes = "gene_values.csv"
// to store the json map annotation name -> list of possible values (this is not a database table)
const toAnnotationToValueMap = "annotationToValue.json"
// ----------------------
func main(){
// if it does not already exists, create the path in which we will store all the created tables
if _, err := os.Stat(topath); os.IsNotExist(err) {
os.Mkdir(topath, os.ModePerm)
}
// ---
// create and store the map (in a .json file) from the annotation name to the list of possible values to avoid
// recomputing it if needed again
storeAnnotationToValuesMap(createAnnotationToValuesMap())
// ---
// create the both the genomic_annotations and gene_values tables and store them in .csv
loadAndCreate()
// ---
// create one table per annotation name with all its possible values and one table (annotation_names) with all
// the annotation names. These tables are needed for the popups autocompletion.
annotationToValues := loadAnnotationToValuesMap()
//println("loaded")
createAnnotationsTables(annotationToValues)
//println("tables created")
createAnnotationsNamesTable(annotationToValues)
// ---
// create the annotation-meta file (not needed anymore)
//annotationToValues := loadAnnotationToValuesMap()
//createAnnotationMetadataFile(annotationToValues)
}
func loadAndCreate() error{
// open all the files, I'll write in them as I read to avoid to load all the csv in go (just one line per time)
from, err := os.Open(fromFile)
defer from.Close()
if err != nil {
log.Fatal("Error while opening", fromFile)
return nil
}
ontGenomicFile, err := os.Create(topath + toOntGenomicNew)
defer ontGenomicFile.Close()
if err != nil {
log.Fatal("Error while opening", topath + toOntGenomicNew)
return err
}
genesFile, err := os.Create(topath + toGenes)
defer genesFile.Close()
if err != nil {
log.Fatal("Error while opening", topath + toGenes)
return err
}
// replacers for strange annotation values. repl2(repl1(str))
repl1 := strings.NewReplacer(
",;", " ",
";,", " ",
";;", " ",
",,", " ",
)
repl2 := strings.NewReplacer(
",", " ",
";", " ",
)
// just keep the list of Hugo_symbol values already written in the file (I don't want to repeat values)
var hugoValues = []string{"?"} // ? so that if some hugo symbol is missing in a row I don't write "?" in genesFile
var annotationToValue = make(map[string]string)
// scanner.Text() returns "-8772697417129649152","{""Hugo_Symbol"":""PI4KB"", ""Entrez_Gene_Id"":""0"", ""Center"":""broad.mit.edu"", ""NCBI_Build"":""GRCh37"", ""Chromosome"":""1"", ""Start_Position"":""151287991"", ""End_Position"":""151287991"", ""Strand"":""-1"", ""Variant_Classification"":""Intron"", ""Variant_Type"":""SNP"", ""Reference_Allele"":""T"", ""Tumor_Seq_Allele1"":""C"", ""Tumor_Seq_Allele2"":""C"", ""dbSNP_RS"":"""", ""dbSNP_Val_Status"":"""", ""Tumor_Sample_Barcode"":""ME002"", ""Matched_Norm_Sample_Barcode"":""ME002-Normal"", ""Match_Norm_Seq_Allele1"":""T"", ""Match_Norm_Seq_Allele2"":""T"", ""Tumor_Validation_Allele1"":"""", ""Tumor_Validation_Allele2"":"""", ""Match_Norm_Validation_Allele1"":"""", ""Match_Norm_Validation_Allele2"":"""", ""Verification_Status"":"""", ""Validation_Status"":"""", ""Mutation_Status"":"""", ""Sequencing_Phase"":"""", ""Sequence_Source"":"""", ""Validation_Method"":"""", ""Score"":"""", ""BAM_File"":"""", ""Sequencer"":""Illumina GAIIx"", ""MA:FImpact"":"""", ""MA:FIS"":"""", ""MA:protein.change"":"""", ""MA:link.MSA"":"""", ""MA:link.PDB"":"""", ""MA:link.var"":"""", ""Tumor_Sample_UUID"":"""", ""Matched_Norm_Sample_UUID"":"""", ""HGVSc"":""c.909+58A>G"", ""HGVSp"":"""", ""HGVSp_Short"":"""", ""Transcript_ID"":""ENST00000368873"", ""Exon_Number"":"""", ""t_depth"":""0"", ""t_ref_count"":""."", ""t_alt_count"":""."", ""n_depth"":""0"", ""n_ref_count"":""."", ""n_alt_count"":""."", ""all_effects"":""PI4KB,intron_variant,,ENST00000368875,NM_002651.2;PI4KB,intron_variant,,ENST00000368874,NM_001198774.1;PI4KB,intron_variant,,ENST00000271657,;PI4KB,intron_variant,,ENST00000368873,;PI4KB,intron_variant,,ENST00000368872,NM_001198773.1;PI4KB,intron_variant,,ENST00000529142,NM_001198775.1;PI4KB,intron_variant,,ENST00000430800,;PI4KB,intron_variant,,ENST00000489223,;PI4KB,downstream_gene_variant,,ENST00000438243,;"", ""Allele"":""C"", ""Gene"":""ENSG00000143393"", ""Feature"":""ENST00000368873"", ""Feature_type"":""Transcript"", ""Consequence"":""intron_variant"", ""cDNA_position"":""-/3340"", ""CDS_position"":""-/2451"", ""Protein_position"":""-/816"", ""Amino_acids"":"""", ""Codons"":"""", ""Existing_variation"":"""", ""ALLELE_NUM"":""1"", ""DISTANCE"":"""", ""SYMBOL"":""PI4KB"", ""SYMBOL_SOURCE"":""HGNC"", ""HGNC_ID"":""8984"", ""BIOTYPE"":""protein_coding"", ""CANONICAL"":"""", ""CCDS"":"""", ""ENSP"":""ENSP00000357867"", ""SWISSPROT"":""PI4KB_HUMAN"", ""TREMBL"":""F8W860_HUMAN,E9PL47_HUMAN,E9PIH4_HUMAN"", ""UNIPARC"":""UPI0000040634"", ""RefSeq"":"""", ""SIFT"":"""", ""PolyPhen"":"""", ""EXON"":"""", ""INTRON"":""02.nov"", ""DOMAINS"":"""", ""GMAF"":"""", ""AFR_MAF"":"""", ""AMR_MAF"":"""", ""ASN_MAF"":"""", ""EAS_MAF"":"""", ""EUR_MAF"":"""", ""SAS_MAF"":"""", ""AA_MAF"":"""", ""EA_MAF"":"""", ""CLIN_SIG"":"""", ""SOMATIC"":"""", ""PUBMED"":"""", ""MOTIF_NAME"":"""", ""MOTIF_POS"":"""", ""HIGH_INF_POS"":"""", ""MOTIF_SCORE_CHANGE"":"""", ""IMPACT"":""MODIFIER"", ""PICK"":"""", ""VARIANT_CLASS"":""SNV"", ""TSL"":"""", ""HGVS_OFFSET"":"""", ""PHENO"":"""", ""MINIMISED"":""1"", ""ExAC_AF"":"""", ""ExAC_AF_AFR"":"""", ""ExAC_AF_AMR"":"""", ""ExAC_AF_EAS"":"""", ""ExAC_AF_FIN"":"""", ""ExAC_AF_NFE"":"""", ""ExAC_AF_OTH"":"""", ""ExAC_AF_SAS"":"""", ""GENE_PHENO"":"""", ""FILTER"":"".""}"
scanner := bufio.NewScanner(from)
i := 0 // there are 232 211 lines to read
for scanner.Scan() {
i = i+1
if i % 10000 == 0{
println(i)
//return nil // todo uncomment this to test if you don't want to scan the whole file
}
var variantNumber string
var variantName string
// split the row in variant identifier and list of genomic annotations
row := strings.SplitN(scanner.Text(), ",", 2)
variantNumber = row[0][1:len(row[0])-1]
// divide the annotations (list of "annotationName"":""annotationValue")
annotationsList := strings.Split(row[1][4:len(row[1])-4], "\"\", \"\"")
// store the current row in map
annotationToValue = map[string]string{}
for _, annot := range annotationsList {
temp := strings.Split(annot, "\"\":\"\"")
annotationName := temp[0]
annotationValue := temp[1]
if annotationValue == "" { continue }
if annotationValue == "-" { continue }
if annotationValue == "Unknown" { continue }
// just check this line don't a duplicated annotation (shouldn't happen)
_, ok := annotationToValue[annotationName]
if ok {println("Error! Variant", variantNumber, "contains duplicated", annotationName)}
annotationToValue[annotationName] = repl2.Replace(repl1.Replace(annotationValue))
}
// once the whole line is loaded let's write it into the files
// first get the t_depth and build the variant name chr:pos:r>A
t_depth := "0" // todo if it not specified what value should I put?
// todo remove t_depth
d, ok := annotationToValue["t_depth"]
if ok{t_depth = d}
delete(annotationToValue, "t_depth")
// if there is no value then put "?"
chrom := "?"
if a, ok := annotationToValue[chromosome]; ok {chrom = a}
startPos := "?"
if a, ok := annotationToValue[start_pos]; ok {startPos = a}
refAllele := "?"
if a, ok := annotationToValue[ref_allele]; ok {refAllele = a}
// just get one of the two
allele := "?"
if a, ok := annotationToValue[tum_allele1]; ok {
allele = a
}else if a, ok := annotationToValue[tum_allele2]; ok {
allele = a
}
variantName =
chrom + ":" +
startPos + ":" +
refAllele + ">"+
allele
// we don't need these entries anymore
delete(annotationToValue, chromosome)
delete(annotationToValue, start_pos)
delete(annotationToValue, end_position)
delete(annotationToValue, ref_allele)
// GENOMIC ONTOLOGY
// first column: variant number
ontGenomicLine := "\"" + variantNumber + "\""
// second column: variant name
ontGenomicLine += ",\"" + variantName + "\""
// third column: Hugo_SymbolValue;(homozygous/heterozygous/missing);annotationName1=annotationValue1;annotationName2=annotationValue2;...
hugoValue := "?"
if a, ok := annotationToValue[hugo_symbol]; ok {
hugoValue = a
}else{
// todo lot have no Hugo_Symbol field
// println("Error! Variant", variantNumber, "contains no HugoSymbol")
}
// tum_allele1 tum_allele2
// nil nil Unknown
// A nil Unknown
// nil B Unknown
// A B Heterozygous
// A A Homozygous
alleleType := ""
a1, ok1 := annotationToValue[tum_allele1]
a2, ok2 := annotationToValue[tum_allele2]
if !ok1 || !ok2{
alleleType = "Unknown"
}else if a1==a2{
alleleType = "Homozygous"
}else{
alleleType = "Heterozygous"
}
ontGenomicLine += ",\"" + hugoValue + ";" + alleleType
delete(annotationToValue, hugo_symbol)
delete(annotationToValue, tum_allele1)
delete(annotationToValue, tum_allele2)
for anName, anValue := range annotationToValue {
ontGenomicLine += ";" + anName + "=" + anValue
}
ontGenomicLine += "\""
// fourth column: t_depth
ontGenomicLine += ",\"" + t_depth + "\""
newline := "\n" // write the row in a new line
if i == 1 {
newline = ""
}
_, err = ontGenomicFile.WriteString(newline + ontGenomicLine)
if err != nil {
log.Fatal("Error in writing in the genomic file", err)
return err
}
// GENES
if !containsOrAdd(&hugoValues, hugoValue) {
_, err = genesFile.WriteString(newline + "\"" + hugoValue + "\"")
if err != nil {
log.Fatal("Error in writing in the genomic file", err)
return err
}
}
}
if err := scanner.Err(); err != nil {
log.Fatal(err)
return err
}
return nil
}
func createAnnotationToValuesMap() map[string]*[]string{
fp, err := os.Open(fromFile)
defer fp.Close()
if err != nil {
log.Fatal("Error while opening", fromFile)
return nil
}
var annotationToValues = make(map[string]*[]string)
// scanner.Text() returns "-8772697417129649152","{""Hugo_Symbol"":""PI4KB"", ""Entrez_Gene_Id"":""0"", ""Center"":""broad.mit.edu"", ""NCBI_Build"":""GRCh37"", ""Chromosome"":""1"", ""Start_Position"":""151287991"", ""End_Position"":""151287991"", ""Strand"":""-1"", ""Variant_Classification"":""Intron"", ""Variant_Type"":""SNP"", ""Reference_Allele"":""T"", ""Tumor_Seq_Allele1"":""C"", ""Tumor_Seq_Allele2"":""C"", ""dbSNP_RS"":"""", ""dbSNP_Val_Status"":"""", ""Tumor_Sample_Barcode"":""ME002"", ""Matched_Norm_Sample_Barcode"":""ME002-Normal"", ""Match_Norm_Seq_Allele1"":""T"", ""Match_Norm_Seq_Allele2"":""T"", ""Tumor_Validation_Allele1"":"""", ""Tumor_Validation_Allele2"":"""", ""Match_Norm_Validation_Allele1"":"""", ""Match_Norm_Validation_Allele2"":"""", ""Verification_Status"":"""", ""Validation_Status"":"""", ""Mutation_Status"":"""", ""Sequencing_Phase"":"""", ""Sequence_Source"":"""", ""Validation_Method"":"""", ""Score"":"""", ""BAM_File"":"""", ""Sequencer"":""Illumina GAIIx"", ""MA:FImpact"":"""", ""MA:FIS"":"""", ""MA:protein.change"":"""", ""MA:link.MSA"":"""", ""MA:link.PDB"":"""", ""MA:link.var"":"""", ""Tumor_Sample_UUID"":"""", ""Matched_Norm_Sample_UUID"":"""", ""HGVSc"":""c.909+58A>G"", ""HGVSp"":"""", ""HGVSp_Short"":"""", ""Transcript_ID"":""ENST00000368873"", ""Exon_Number"":"""", ""t_depth"":""0"", ""t_ref_count"":""."", ""t_alt_count"":""."", ""n_depth"":""0"", ""n_ref_count"":""."", ""n_alt_count"":""."", ""all_effects"":""PI4KB,intron_variant,,ENST00000368875,NM_002651.2;PI4KB,intron_variant,,ENST00000368874,NM_001198774.1;PI4KB,intron_variant,,ENST00000271657,;PI4KB,intron_variant,,ENST00000368873,;PI4KB,intron_variant,,ENST00000368872,NM_001198773.1;PI4KB,intron_variant,,ENST00000529142,NM_001198775.1;PI4KB,intron_variant,,ENST00000430800,;PI4KB,intron_variant,,ENST00000489223,;PI4KB,downstream_gene_variant,,ENST00000438243,;"", ""Allele"":""C"", ""Gene"":""ENSG00000143393"", ""Feature"":""ENST00000368873"", ""Feature_type"":""Transcript"", ""Consequence"":""intron_variant"", ""cDNA_position"":""-/3340"", ""CDS_position"":""-/2451"", ""Protein_position"":""-/816"", ""Amino_acids"":"""", ""Codons"":"""", ""Existing_variation"":"""", ""ALLELE_NUM"":""1"", ""DISTANCE"":"""", ""SYMBOL"":""PI4KB"", ""SYMBOL_SOURCE"":""HGNC"", ""HGNC_ID"":""8984"", ""BIOTYPE"":""protein_coding"", ""CANONICAL"":"""", ""CCDS"":"""", ""ENSP"":""ENSP00000357867"", ""SWISSPROT"":""PI4KB_HUMAN"", ""TREMBL"":""F8W860_HUMAN,E9PL47_HUMAN,E9PIH4_HUMAN"", ""UNIPARC"":""UPI0000040634"", ""RefSeq"":"""", ""SIFT"":"""", ""PolyPhen"":"""", ""EXON"":"""", ""INTRON"":""02.nov"", ""DOMAINS"":"""", ""GMAF"":"""", ""AFR_MAF"":"""", ""AMR_MAF"":"""", ""ASN_MAF"":"""", ""EAS_MAF"":"""", ""EUR_MAF"":"""", ""SAS_MAF"":"""", ""AA_MAF"":"""", ""EA_MAF"":"""", ""CLIN_SIG"":"""", ""SOMATIC"":"""", ""PUBMED"":"""", ""MOTIF_NAME"":"""", ""MOTIF_POS"":"""", ""HIGH_INF_POS"":"""", ""MOTIF_SCORE_CHANGE"":"""", ""IMPACT"":""MODIFIER"", ""PICK"":"""", ""VARIANT_CLASS"":""SNV"", ""TSL"":"""", ""HGVS_OFFSET"":"""", ""PHENO"":"""", ""MINIMISED"":""1"", ""ExAC_AF"":"""", ""ExAC_AF_AFR"":"""", ""ExAC_AF_AMR"":"""", ""ExAC_AF_EAS"":"""", ""ExAC_AF_FIN"":"""", ""ExAC_AF_NFE"":"""", ""ExAC_AF_OTH"":"""", ""ExAC_AF_SAS"":"""", ""GENE_PHENO"":"""", ""FILTER"":"".""}"
scanner := bufio.NewScanner(fp)
// todo remove i (get all rows)
i := 0 // there are 232 211 lines to read
for scanner.Scan() {
i = i+1
if i % 1000 == 0{
//println(i)
//return annotationToValues
}
// drop the mutation number
annotationsString := strings.SplitN(scanner.Text(), ",", 2)[1]
// divide the annotations (list of "annotationName"":""annotationValue")
annotationsList := strings.Split(annotationsString[4:len(annotationsString)-4], "\"\", \"\"")
for _, annot := range annotationsList {
temp := strings.Split(annot, "\"\":\"\"")
annotationName := temp[0]
annotationValue := temp[1]
if annotationValue == "" { continue }
if annotationValue == "-" { continue }
if annotationValue == "Unknown" { continue }
_, ok := annotationToValues[annotationName]
// it is the first time I see the annotations, then add to the map
if !ok {
annotationToValues[annotationName] = &[]string{}
}
// add the new value if not already in the list
containsOrAdd(annotationToValues[annotationName], annotationValue)
//// check if the value is already in the map
//alreadypresent := false
//for _, val := range annotationToValues[annotationName]{
// if val == annotationValue {
// alreadypresent = true
// break
// }
//}
//
//if !alreadypresent{
// annotationToValues[annotationName] = append(annotationToValues[annotationName], annotationValue)
//}
}
}
if err := scanner.Err(); err != nil {
log.Fatal(err)
}
return annotationToValues
}
func storeAnnotationToValuesMap(annotationToValues map[string]*[]string){
jsonString, err := json.Marshal(annotationToValues)
if err != nil {print("Error marshalling to json"); return}
annofile, err := os.Create(topath + toAnnotationToValueMap)
if err != nil {log.Fatal("Error while creating", topath + toAnnotationToValueMap); return}
defer annofile.Close()
_, err = annofile.Write(jsonString)
if err != nil {log.Fatal("Error in writing in the genomic file", err)}
}
func loadAnnotationToValuesMap() (annotationToValues map[string]*[]string){
annofile, err := os.Open(topath + toAnnotationToValueMap)
if err != nil { log.Fatal("Error while opening", topath + toAnnotationToValueMap); return }
defer annofile.Close()
//var bytes = make([]byte, 5)
bytes, err := ioutil.ReadAll(annofile)
//_, err = annofile.Read(bytes)
if err != nil {log.Fatal("Error while reading", topath + toAnnotationToValueMap, err); return}
err = json.Unmarshal(bytes, &annotationToValues)
if err != nil {println("Error unmarshalling from json:", err.Error()); return}
return
}
// given the map create one csv file per annotation storing its values
func createAnnotationsTables(annotationToValues map[string]*[]string){
var anno_path = topath + "annotation_tables/"
// create path if not existing
if _, err := os.Stat(anno_path); os.IsNotExist(err) {
os.Mkdir(anno_path, os.ModePerm)
}
for annotation, values := range annotationToValues {
annotation = strings.Replace(strings.ToLower(annotation), " ", "_", -1)
var anno_file = anno_path + annotation + ".csv"
annofile, err := os.Create(anno_file)
if err != nil {log.Fatal("Error while creating", anno_file); return}
defer annofile.Close()
valuesStr := "\"" + strings.Join(*values, "\"\n\"") + "\""
_, err = annofile.WriteString(valuesStr)
if err != nil {
log.Fatal("Error while writing file ", anno_file, err)
}
}
}
// given the map creates one csv file with all the annotation names
func createAnnotationsNamesTable(annotationToValues map[string]*[]string){
var anno_file = topath + "annotation_tables/annotation_names"
annofile, err := os.Create(anno_file)
if err != nil {log.Fatal("Error while creating", anno_file); return}
defer annofile.Close()
annotation_names := make([]string, 0, len(annotationToValues))
for name := range annotationToValues {
annotation_names = append(annotation_names, name)
}
valuesStr := "\"" + strings.Join(annotation_names, "\"\n\"") + "\""
_, err = annofile.WriteString(valuesStr)
if err != nil {
log.Fatal("Error while writing file ", anno_file, err)
}
}
func containsOrAdd(array *[]string, str string) bool {
for _, s := range *array {
if s == str {
return true
}
}
*array = append(*array, str)
return false
}

Event Timeline