DatasetsManipulations.java (R4434 MedCo I2b2 Cell)
package ch.epfl.lca1.medco;

import ch.epfl.lca1.medco.unlynx.UnlynxEncrypt;
import ch.epfl.lca1.medco.util.MedCoUtil;
import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.postgresql.ds.PGSimpleDataSource;

import java.io.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Created by misbach on 20.07.17.
 */
public class DatasetsManipulations {

    public static void main(String[] args) throws IOException {
        //splitting();
        //generation();
        getInfoUseCases();
        //generateClearGenomicDataset();
        //timeEncrypt();
        //generateHistogramsForLeakage();
    }
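
    // Note on the opencsv calls below (a reading aid, not part of the original logic):
    // the readers are built with the opencsv 3.x signature
    // CSVReader(Reader, char separator, char quotechar, int skipLines).
    // '\t' selects tab-separated parsing, '\u0000' as quote character effectively
    // disables quote handling, and skipLines jumps over the metadata lines at the top
    // of the input files (5 for the clinical file, 1 for the mutations file, 0 for the
    // generated files).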

    public static void generateHistogramsForLeakage() throws IOException {
        // generate histograms
        //todo: MedCoLoadingClient.loadSrv1Conf();
        String genomicFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad.txt",
                clinicalFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad.txt",
                clearGenomicOutputPath1 = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/histogram_nb_patients.txt",
                clearGenomicOutputPath2 = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/histogram_nb_facts.txt";

        // maps sample id -> patient id
        Map<String, String> samplePatientMap = new HashMap<>();

        // read clinical: store patient id - sample ids mapping
        {
            CSVReader clinicalReader = new CSVReader(new FileReader(clinicalFilePath), '\t', '\u0000', 5);
            String[] clinicalHeader = clinicalReader.readNext();

            // headers idx clinical
            int sampleIdId = -1, patientIdId = -1;
            for (int i = 0; i < clinicalHeader.length; i++) {
                if (clinicalHeader[i].trim().equals("SAMPLE_ID")) {
                    sampleIdId = i;
                } else if (clinicalHeader[i].trim().equals("PATIENT_ID")) {
                    patientIdId = i;
                }
            }

            // dataset store mapping
            String[] clinicalEntry;
            while ((clinicalEntry = clinicalReader.readNext()) != null) {
                try {
                    samplePatientMap.put(clinicalEntry[sampleIdId], clinicalEntry[patientIdId]);
                } catch (Throwable e) {
                    System.err.println("ignoring clinical entry ...");
                }
            }
            clinicalReader.close();
        }

        // map variant id -> patients set
        Map<Long, Set<String>> nbPatientsMap = new HashMap<>();
        // map patient -> nb facts
        Map<String, Integer> nbFacts = new HashMap<>();

        {
            CSVReader genomicReader = new CSVReader(new FileReader(genomicFilePath), '\t', '\u0000', 1);
            String[] genomicHeader = genomicReader.readNext();

            // headers idx genomic
            int sampleIdId = -1, chromId = -1, startPosId = -1, altAlleleId = -1, refAlleleId = -1;
            for (int i = 0; i < genomicHeader.length; i++) {
                if (genomicHeader[i].trim().equals("Tumor_Sample_Barcode")) {
                    sampleIdId = i;
                } else if (genomicHeader[i].trim().equals("Chromosome")) {
                    chromId = i;
                } else if (genomicHeader[i].trim().equals("Start_Position")) {
                    startPosId = i;
                } else if (genomicHeader[i].trim().equals("Reference_Allele")) {
                    refAlleleId = i;
                } else if (genomicHeader[i].trim().equals("Tumor_Seq_Allele1")) {
                    altAlleleId = i;
                }
            }

            // dataset genomic reading + generating [sample_id, patient_id, variant_id]
            String[] genomicEntry;
            while ((genomicEntry = genomicReader.readNext()) != null) {
                try {
                    // extract from entry
                    long variantId = EncryptedIdentifiersManager.getVariantId(
                            genomicEntry[chromId].trim(),
                            Long.parseLong(genomicEntry[startPosId].trim()),
                            genomicEntry[refAlleleId].trim(),
                            genomicEntry[altAlleleId].trim());
                    String patientId = samplePatientMap.get(genomicEntry[sampleIdId]);

                    // add patients to the set corresponding to variant
                    if (!nbPatientsMap.containsKey(variantId)) {
                        nbPatientsMap.put(variantId, new HashSet<>());
                    }
                    nbPatientsMap.get(variantId).add(patientId);

                    // update nb facts / patient
                    if (!nbFacts.containsKey(patientId)) {
                        nbFacts.put(patientId, 0);
                    }
                    nbFacts.replace(patientId, nbFacts.get(patientId) + 1);
                } catch (Throwable e) {
                    System.err.println("ignoring genomic entry ...");
                }
            }
            genomicReader.close();
        }

        CSVWriter genomicWriter1 = new CSVWriter(new FileWriter(clearGenomicOutputPath1), '\t', '\u0000');
        genomicWriter1.writeNext(new String[]{"VARIANT_ID", "NB_PATIENTS"});
        for (Map.Entry<Long, Set<String>> entry : nbPatientsMap.entrySet()) {
            genomicWriter1.writeNext(new String[]{String.valueOf(entry.getKey()), String.valueOf(entry.getValue().size())});
        }

        CSVWriter genomicWriter2 = new CSVWriter(new FileWriter(clearGenomicOutputPath2), '\t', '\u0000');
        genomicWriter2.writeNext(new String[]{"PATIENT_ID", "NB_GENOMIC_FACTS"});
        for (Map.Entry<String, Integer> entry : nbFacts.entrySet()) {
            genomicWriter2.writeNext(new String[]{String.valueOf(entry.getKey()), String.valueOf(entry.getValue())});
        }

        genomicWriter1.close();
        genomicWriter2.close();
    }
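
    // A more idiomatic Java 8 sketch of the two map updates above (behavior-equivalent,
    // shown only as an alternative to the containsKey/put/replace sequence):
    //   nbPatientsMap.computeIfAbsent(variantId, k -> new HashSet<>()).add(patientId);
    //   nbFacts.merge(patientId, 1, Integer::sum);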

    public static void timeEncrypt() throws IOException {
        Logger.getRootLogger().setLevel(Level.OFF);
        loadSrv1Conf();
        UnlynxEncrypt encrypt = new UnlynxEncrypt();

        //StopWatch.overall.start();
        for (int i = 0; i < 1000000; i++) {
            String hey = encrypt.encryptInt(i);
            if (i % 250000 == 0) {
                //long time = StopWatch.overall.getTotalTimeMillis();
                //System.out.println(time);
            }
            if (i % 10000 == 0) {
                System.out.println(i);
            }
        }
    }
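
    // A minimal JDK-only timing sketch for the loop above, standing in for the
    // commented-out StopWatch (assumes encryptInt is the operation being measured):
    //   long start = System.nanoTime();
    //   for (int i = 0; i < 1000000; i++) encrypt.encryptInt(i);
    //   System.out.println((System.nanoTime() - start) / 1_000_000 + " ms for 1M encryptions");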

    public static void generateClearGenomicDataset() throws IOException {
        // generate full dataset for clear i2b2 loading, then split using the existing things
        loadSrv1Conf();
        String genomicFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad.txt",
                clinicalFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad.txt",
                clearGenomicOutputPath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2.txt";

        // maps sample id -> patient id
        Map<String, String> samplePatientMap = new HashMap<>();

        // read clinical: store patient id - sample ids mapping
        {
            CSVReader clinicalReader = new CSVReader(new FileReader(clinicalFilePath), '\t', '\u0000', 5);
            String[] clinicalHeader = clinicalReader.readNext();

            // headers idx clinical
            int sampleIdId = -1, patientIdId = -1;
            for (int i = 0; i < clinicalHeader.length; i++) {
                if (clinicalHeader[i].trim().equals("SAMPLE_ID")) {
                    sampleIdId = i;
                } else if (clinicalHeader[i].trim().equals("PATIENT_ID")) {
                    patientIdId = i;
                }
            }

            // dataset store mapping
            String[] clinicalEntry;
            while ((clinicalEntry = clinicalReader.readNext()) != null) {
                try {
                    samplePatientMap.put(clinicalEntry[sampleIdId], clinicalEntry[patientIdId]);
                } catch (Throwable e) {
                    System.err.println("ignoring clinical entry ...");
                }
            }
            clinicalReader.close();
        }

        // read genomic and output dataset we want
        {
            CSVReader genomicReader = new CSVReader(new FileReader(genomicFilePath), '\t', '\u0000', 1);
            String[] genomicHeader = genomicReader.readNext();

            CSVWriter genomicWriter = new CSVWriter(new FileWriter(clearGenomicOutputPath), '\t', '\u0000');
            genomicWriter.writeNext(new String[]{"SAMPLE_ID", "PATIENT_ID", "VARIANT_ID"});

            // headers idx genomic
            int sampleIdId = -1, chromId = -1, startPosId = -1, altAlleleId = -1, refAlleleId = -1;
            for (int i = 0; i < genomicHeader.length; i++) {
                if (genomicHeader[i].trim().equals("Tumor_Sample_Barcode")) {
                    sampleIdId = i;
                } else if (genomicHeader[i].trim().equals("Chromosome")) {
                    chromId = i;
                } else if (genomicHeader[i].trim().equals("Start_Position")) {
                    startPosId = i;
                } else if (genomicHeader[i].trim().equals("Reference_Allele")) {
                    refAlleleId = i;
                } else if (genomicHeader[i].trim().equals("Tumor_Seq_Allele1")) {
                    altAlleleId = i;
                }
            }

            // dataset genomic reading + generating [sample_id, patient_id, variant_id]
            String[] genomicEntry;
            while ((genomicEntry = genomicReader.readNext()) != null) {
                try {
                    long variantId = EncryptedIdentifiersManager.getVariantId(
                            genomicEntry[chromId].trim(),
                            Long.parseLong(genomicEntry[startPosId].trim()),
                            genomicEntry[refAlleleId].trim(),
                            genomicEntry[altAlleleId].trim());
                    genomicWriter.writeNext(new String[]{
                            genomicEntry[sampleIdId],
                            samplePatientMap.get(genomicEntry[sampleIdId]),
                            variantId + ""});
                } catch (Throwable e) {
                    System.err.println("ignoring genomic entry ...");
                }
            }
            genomicReader.close();
            // close to flush buffered rows to disk
            genomicWriter.close();
        }
    }
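
    // Illustrative shape of a generateClearGenomicDataset output row (hypothetical
    // values, tab-separated -- not taken from the real dataset):
    //   TCGA-AB-1234-01    TCGA-AB-1234    123456789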

    public static void getInfoUseCases() throws IOException {
        // cancer type detailed = cutaneous melanoma
        // primary tumor localization = skin
        // annotation protein position = 600/766
        // hugo symbol = braf
        loadSrv1Conf();
        Logger.getRootLogger().setLevel(Level.DEBUG);
        String genomicFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad.txt";

        Set<Long> clearQueryVariantsUseCase1 = new HashSet<>(),
                clearQueryVariantsUseCase2Braf = new HashSet<>(),
                clearQueryVariantsUseCase2Others = new HashSet<>();

        {
            CSVReader genomicReader = new CSVReader(new FileReader(genomicFilePath), '\t', '\u0000', 1);
            String[] genomicHeader = genomicReader.readNext();

            // headers idx genomic
            int proteinPosId = -1, hugoSymbolId = -1, sampleIdId = -1, chromId = -1,
                    startPosId = -1, altAlleleId = -1, refAlleleId = -1;
            for (int i = 0; i < genomicHeader.length; i++) {
                if (genomicHeader[i].trim().equals("Hugo_Symbol")) {
                    hugoSymbolId = i;
                } else if (genomicHeader[i].trim().equals("Protein_position")) {
                    proteinPosId = i;
                } else if (genomicHeader[i].trim().equals("Tumor_Sample_Barcode")) {
                    sampleIdId = i;
                } else if (genomicHeader[i].trim().equals("Chromosome")) {
                    chromId = i;
                } else if (genomicHeader[i].trim().equals("Start_Position")) {
                    startPosId = i;
                } else if (genomicHeader[i].trim().equals("Reference_Allele")) {
                    refAlleleId = i;
                } else if (genomicHeader[i].trim().equals("Tumor_Seq_Allele1")) {
                    altAlleleId = i;
                }
            }

            // read dataset
            String[] genomicEntry;
            while ((genomicEntry = genomicReader.readNext()) != null) {
                try {
                    // use case 1
                    if (genomicEntry[proteinPosId].trim().equals("600/766") &&
                            genomicEntry[hugoSymbolId].trim().equals("BRAF")) {
                        long id = EncryptedIdentifiersManager.getVariantId(
                                genomicEntry[chromId].trim(),
                                Long.parseLong(genomicEntry[startPosId].trim()),
                                genomicEntry[refAlleleId].trim(),
                                genomicEntry[altAlleleId].trim());
                        clearQueryVariantsUseCase1.add(id);
                    }

                    // use case 2 - braf
                    if (genomicEntry[hugoSymbolId].trim().equals("BRAF")) {
                        long id = EncryptedIdentifiersManager.getVariantId(
                                genomicEntry[chromId].trim(),
                                Long.parseLong(genomicEntry[startPosId].trim()),
                                genomicEntry[refAlleleId].trim(),
                                genomicEntry[altAlleleId].trim());
                        clearQueryVariantsUseCase2Braf.add(id);
                    }

                    // use case 2 - other mutations
                    if (genomicEntry[hugoSymbolId].trim().equals("PTEN") ||
                            genomicEntry[hugoSymbolId].trim().equals("CDKN2A") ||
                            genomicEntry[hugoSymbolId].trim().equals("MAP2K2") ||
                            genomicEntry[hugoSymbolId].trim().equals("MAP2K1")) {
                        long id = EncryptedIdentifiersManager.getVariantId(
                                genomicEntry[chromId].trim(),
                                Long.parseLong(genomicEntry[startPosId].trim()),
                                genomicEntry[refAlleleId].trim(),
                                genomicEntry[altAlleleId].trim());
                        clearQueryVariantsUseCase2Others.add(id);
                    }
                } catch (Throwable e) {
                    System.err.println("ignoring genomic variant ...");
                }
            }
            genomicReader.close();

            System.err.println("nbs: " + clearQueryVariantsUseCase1.size() + " / " +
                    clearQueryVariantsUseCase2Braf.size() + " / " +
                    clearQueryVariantsUseCase2Others.size());
        }

        UnlynxEncrypt encrypt = new UnlynxEncrypt();

        // gen queries encrypted
        System.out.println("--- use case 1 ---\n");
        System.out.print("MEDCO_ENC:" + encrypt.encryptInt(1) + " AND "); // val
        System.out.println("MEDCO_ENC:" + encrypt.encryptInt(2) + " AND "); // val
        for (Long encId : clearQueryVariantsUseCase1) {
            System.out.println("MEDCO_GEN:" + encrypt.encryptInt(encId) + " OR ");
        }

        System.out.println("\n\n\n--- use case 2 ---\n");
        System.out.print(" -- clinical reminder ");
        for (Long encId : clearQueryVariantsUseCase2Braf) {
            System.out.println("\\\\SENSITIVE_TAGGED\\medco\\encrypted\\" + encrypt.encryptInt(encId) + "\\ OR ");
        }
        System.out.println(" AND ");
        for (Long encId : clearQueryVariantsUseCase2Others) {
            System.out.println("\\\\SENSITIVE_TAGGED\\medco\\encrypted\\" + encrypt.encryptInt(encId) + "\\ OR ");
        }

        // gen queries clear
        System.out.println("\n\n\n--- use case 1 clear ---\n");
        for (Long id : clearQueryVariantsUseCase1) {
            System.out.println("\\\\CLINICAL_NON_SENSITIVE\\medco\\clinical\\nonsensitive\\VARIANT_ID\\" + id + "\\ OR ");
        }

        System.out.println("\n\n\n--- use case 2 clear ---\n");
        for (Long id : clearQueryVariantsUseCase2Braf) {
            System.out.println("\\\\CLINICAL_NON_SENSITIVE\\medco\\clinical\\nonsensitive\\VARIANT_ID\\" + id + "\\ OR ");
        }
        System.out.println(" AND ");
        for (Long id : clearQueryVariantsUseCase2Others) {
            System.out.println("\\\\CLINICAL_NON_SENSITIVE\\medco\\clinical\\nonsensitive\\VARIANT_ID\\" + id + "\\ OR ");
        }
    }
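
    // The fragments printed above end with trailing " OR " / " AND " connectors; they
    // appear to be meant for manual assembly into i2b2/MedCo query panels, with the
    // last connector of each block removed by hand.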

    private static void generateLessData(int factor, String genomicInput, String genomicOutput) throws IOException {
        CSVWriter genomicWriter = new CSVWriter(new FileWriter(genomicOutput), '\t', '\u0000');
        CSVReader genomicReader = new CSVReader(new FileReader(genomicInput), '\t', '\u0000', 0);

        String[] genomicHeader = genomicReader.readNext();
        genomicWriter.writeNext(genomicHeader);

        String[] genomicEntry;
        int entriesCount = 0, writtenEntriesCount = 0;
        while ((genomicEntry = genomicReader.readNext()) != null) {
            if (entriesCount % factor == 0) {
                genomicWriter.writeNext(genomicEntry);
                writtenEntriesCount++;
            }
            entriesCount++;
        }
        genomicReader.close();
        genomicWriter.close();

        System.out.println("Less data factor: " + factor + ", parsed " + entriesCount + " entries, written " + writtenEntriesCount);
    }
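
    // Worked example: with factor 2, generateLessData keeps rows 0, 2, 4, ... (1 in 2);
    // with factor 4 it keeps rows 0, 4, 8, ... (1 in 4). generateMoreData below instead
    // writes each row factor times. The header row is always copied exactly once.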

    private static void generateMoreData(int factor, String genomicInput, String genomicOutput) throws IOException {
        CSVWriter genomicWriter = new CSVWriter(new FileWriter(genomicOutput), '\t', '\u0000');
        CSVReader genomicReader = new CSVReader(new FileReader(genomicInput), '\t', '\u0000', 0);

        String[] genomicHeader = genomicReader.readNext();
        genomicWriter.writeNext(genomicHeader);

        String[] genomicEntry;
        int entriesCount = 0, writtenEntriesCount = 0;
        while ((genomicEntry = genomicReader.readNext()) != null) {
            for (int i = 0; i < factor; i++) {
                genomicWriter.writeNext(genomicEntry);
                writtenEntriesCount++;
            }
            entriesCount++;
        }
        genomicReader.close();
        genomicWriter.close();

        System.out.println("More data factor: " + factor + ", parsed " + entriesCount + " entries, written " + writtenEntriesCount);
    }

    /*
    From each part file, generate:
      - clinical_half_patients: remove half of the patients, assign the rest of the data to the remaining ones
      - clinical_double_patients: duplicate all patients
      - genomic_half_patients,
      - genomic_double_patients
    Data generation:
      - when less --> retain only 1 out of 2 or 1 out of 4 records
      - when more --> all records duplicated / quadrupled
      --> clinical remains the same, only genomic is touched (because nb clinical << nb genomic)
    */
    public static void generation() throws IOException {
        String genomicPart1 = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part1",
                genomicPart2 = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part2",
                genomicPart3 = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part3";

        // half generation
        generateLessData(2, genomicPart1 + ".txt", genomicPart1 + "_half.txt");
        generateLessData(2, genomicPart2 + ".txt", genomicPart2 + "_half.txt");
        generateLessData(2, genomicPart3 + ".txt", genomicPart3 + "_half.txt");

        // quarter generation
        generateLessData(4, genomicPart1 + ".txt", genomicPart1 + "_quarter.txt");
        generateLessData(4, genomicPart2 + ".txt", genomicPart2 + "_quarter.txt");
        generateLessData(4, genomicPart3 + ".txt", genomicPart3 + "_quarter.txt");

        // double generation
        generateMoreData(2, genomicPart1 + ".txt", genomicPart1 + "_double.txt");
        generateMoreData(2, genomicPart2 + ".txt", genomicPart2 + "_double.txt");
        generateMoreData(2, genomicPart3 + ".txt", genomicPart3 + "_double.txt");

        // quadruple generation
        generateMoreData(4, genomicPart1 + ".txt", genomicPart1 + "_quadruple.txt");
        generateMoreData(4, genomicPart2 + ".txt", genomicPart2 + "_quadruple.txt");
        generateMoreData(4, genomicPart3 + ".txt", genomicPart3 + "_quadruple.txt");
    }
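
    // splitting() below partitions the original cBioPortal-style files (data_clinical /
    // data_mutations_extended) into 3 per-site parts: first 40 clinical entries, next 40,
    // and the remainder. It re-encodes the ISO-8859-1 input to UTF-8 output and routes
    // each genomic entry to the part whose clinical file contains its sample id.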
    public static void splitting() throws IOException {
        String clinicalFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad.txt",
                genomicFilePath = "/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad.txt";

        FileOutputStream os11 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad_clear_i2b2_part1_encodingOK.txt"),
                os12 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad_clear_i2b2_part2_encodingOK.txt"),
                os13 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_clinical_skcm_broad_clear_i2b2_part3_encodingOK.txt"),
                os21 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part1_encodingOK.txt"),
                os22 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part2_encodingOK.txt"),
                os23 = new FileOutputStream("/home/misbach/repositories/i2b2-core-server-medco/ch.epfl.lca1.medco/testfiles/datasets/full/skcm_broad/data_mutations_extended_skcm_broad_clear_i2b2_part3_encodingOK.txt");

        CSVWriter[] clinicalWriters = new CSVWriter[]{
                new CSVWriter(new OutputStreamWriter(os11, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
                new CSVWriter(new OutputStreamWriter(os12, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
                new CSVWriter(new OutputStreamWriter(os13, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
        };
        CSVWriter[] genomicWriters = new CSVWriter[]{
                new CSVWriter(new OutputStreamWriter(os21, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
                new CSVWriter(new OutputStreamWriter(os22, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
                new CSVWriter(new OutputStreamWriter(os23, "UTF-8"), '\t', CSVWriter.DEFAULT_QUOTE_CHARACTER, CSVWriter.RFC4180_LINE_END),
        };

        CSVReader clinicalReader = new CSVReader(new InputStreamReader(new FileInputStream(clinicalFilePath), "ISO-8859-1"), '\t', '\u0000', 5);
        String[] clinicalHeader = clinicalReader.readNext();
        clinicalWriters[0].writeNext(clinicalHeader);
        clinicalWriters[1].writeNext(clinicalHeader);
        clinicalWriters[2].writeNext(clinicalHeader);

        CSVReader genomicReader = new CSVReader(new InputStreamReader(new FileInputStream(genomicFilePath), "ISO-8859-1"), '\t', '\u0000', 1);
        String[] genomicHeader = genomicReader.readNext();
        genomicWriters[0].writeNext(genomicHeader);
        genomicWriters[1].writeNext(genomicHeader);
        genomicWriters[2].writeNext(genomicHeader);

        Set<String>[] sampleIdsSets = new HashSet[]{new HashSet<>(), new HashSet<>(), new HashSet<>()};

        // read clinical file
        String[] clinicalEntry;
        int clinicalEntriesCount = 0;
        while ((clinicalEntry = clinicalReader.readNext()) != null) {
            clinicalEntriesCount++;
            if (clinicalEntriesCount <= 40) {
                clinicalWriters[0].writeNext(clinicalEntry);
                sampleIdsSets[0].add(clinicalEntry[0].trim());
            } else if (clinicalEntriesCount > 40 && clinicalEntriesCount <= 80) {
                clinicalWriters[1].writeNext(clinicalEntry);
                sampleIdsSets[1].add(clinicalEntry[0].trim());
            } else {
                clinicalWriters[2].writeNext(clinicalEntry);
                sampleIdsSets[2].add(clinicalEntry[0].trim());
            }
        }
        clinicalReader.close();
        System.out.println("Parsed " + clinicalEntriesCount + " entries, sizes: " +
                sampleIdsSets[0].size() + " - " + sampleIdsSets[1].size() + " - " + sampleIdsSets[2].size());

        // read genomic file
        int sampleIdIdx = 15; // 15 for skcm broad original, 2 for generated clear i2b2 dataset
        String[] genomicEntry;
        int genomicCount = 0;
        while ((genomicEntry = genomicReader.readNext()) != null) {
            genomicCount++;
            if (sampleIdsSets[0].contains(genomicEntry[sampleIdIdx].trim())) {
                genomicWriters[0].writeNext(genomicEntry);
            } else if (sampleIdsSets[1].contains(genomicEntry[sampleIdIdx].trim())) {
                genomicWriters[1].writeNext(genomicEntry);
            } else if (sampleIdsSets[2].contains(genomicEntry[sampleIdIdx].trim())) {
                genomicWriters[2].writeNext(genomicEntry);
            } else {
                System.out.println("Non existing sample in clinical: " + genomicEntry[sampleIdIdx]);
            }
        }
        genomicReader.close();
        System.out.println("Parsed " + genomicCount + " entries");

        clinicalWriters[0].close();
        clinicalWriters[1].close();
        clinicalWriters[2].close();
        genomicWriters[0].close();
        genomicWriters[1].close();
        genomicWriters[2].close();
    }

    protected static void loadMedCoConf(String hostname, int i2b2Port, int psqlPort, String unlynxEntryPoint) {
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.ONTCELL_WS_URL_PROPERTIES,
                "http://" + hostname + ":" + i2b2Port + "/i2b2/services/OntologyService");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.FRCELL_WS_URL_PROPERTIES,
                "http://" + hostname + ":" + i2b2Port + "/i2b2/services/FRService");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.CRCCELL_WS_URL_PROPERTIES,
                "http://" + hostname + ":" + i2b2Port + "/i2b2/services/QueryToolService");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.I2B2CELLS_WS_WAITTIME_PROPERTIES, "180000");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.UNLYNX_BINARY_PATH_PROPERTIES, "i2b2"); // assumed in bin path
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.UNLYNX_GROUP_FILE_PATH_PROPERTIES,
                "/home/misbach/repositories/medco-deployment/configuration-profiles/exp-3nodes-icclusters/group.toml");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.UNLYNX_DEBUG_LEVEL_PROPERTIES, "5");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.UNLYNX_PROOFS_PROPERTIES, "0");
        MedCoUtil.getTestInstance().setProperty(MedCoUtil.UNLYNX_ENTRY_POINT_IDX_PROPERTIES, unlynxEntryPoint);

        PGSimpleDataSource ds = new PGSimpleDataSource();
        ds.setServerName(hostname);
        ds.setDatabaseName("medcodeployment");
        ds.setPortNumber(psqlPort);
        ds.setUser("postgres");
        ds.setPassword("prigen2017");
        //ds.setCurrentSchema("medco_data");
        MedCoUtil.getTestInstance().setDataSource(ds);
    }

    protected static void loadSrv1Conf() {
        loadMedCoConf("localhost", 8082, 5434, "0");
    }

    protected static void loadSrv3Conf() {
        loadMedCoConf("iccluster062.iccluster.epfl.ch", 8080, 5432, "1");
    }

    protected static void loadSrv5Conf() {
        loadMedCoConf("iccluster063.iccluster.epfl.ch", 8080, 5432, "2");
    }
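
    // Hypothetical example of targeting another deployment with loadMedCoConf (hostname
    // and ports are placeholders, not part of the configurations above):
    //   protected static void loadCustomConf() {
    //       loadMedCoConf("my-medco-node.example.org", 8080, 5432, "0");
    //   }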
}