Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F81631554
load_files.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Sep 7, 03:26
Size
7 KB
Mime Type
text/x-python
Expires
Mon, Sep 9, 03:26 (2 d)
Engine
blob
Format
Raw Data
Handle
20601006
Attached To
R11484 ADDI
load_files.py
View Options
import
os
import
json
from
abc
import
abstractmethod
class Dataset:
    """Base class for loading, splitting and saving question-answering
    datasets, specialised per model type (e.g. BERT, GPT) by subclasses.

    NOTE(review): most methods navigate with os.chdir and never restore
    the previous working directory -- callers appear to rely on this.
    """

    def __init__(self, home_path, finetune_path, data_path="./datasets/",
                 split=0.9):
        """Store the paths and initialise the (empty) data containers.

        :param home_path: project root directory
        :param finetune_path: directory where the formatted fine-tuning
            files are written by `_format_data`
        :param data_path: directory (relative to home_path) holding the
            raw JSON datasets
        :param split: fraction of the loaded train set kept for training;
            the remainder becomes the evaluation set
        """
        # Data containers, filled by load_data().
        self.xtrain = None
        self.ytrain = None
        self.xeval = None
        self.yeval = None
        self.xtest = None
        self.ytest = None
        self.data_path = data_path
        self.loc_data_path = None   # absolute path of the loaded dataset dir
        self.finetune_path = finetune_path
        self.home_path = home_path
        self.split = split
        # Special tokens used to wrap context/question/answer strings.
        self.keywords = ["<input>", "<answer>", "<find>", "<|endoftext|>"]

    # Private methods
    # protected functions
    def _openJSON(self, dataset, end="", ext=".json"):
        """Load and return the JSON file '<dataset><end><ext>' from the cwd."""
        with open(dataset + end + ext, 'r') as fp:
            tmp = json.load(fp)
        return tmp

    def _writeTXT(self, data, name):
        """Write the string `data` to '<name>.txt' in the cwd."""
        with open(name + ".txt", 'w') as f:
            f.write(data)

    def _writeJSON(self, data, name):
        """Serialise `data` as JSON to '<name>.json' in the cwd."""
        with open(name + ".json", "w") as f:
            json.dump(data, f)

    @abstractmethod
    def _format_data(self, end):
        """Write the model-specific fine-tuning files (subclass hook)."""
        raise NotImplementedError(
            "_format_data has to be defined in each child class")

    @abstractmethod
    def _save_data(self, x, y):
        """Pre-process (x, y) before saving (subclass hook)."""
        # BUG FIX: the message used to say "format_data" (copy-paste error).
        raise NotImplementedError(
            "_save_data has to be defined in each child class")

    # Public methods
    def save_data(self, x, y, dir, end="", x_test=None, y_test=None,
                  console=None):
        """Format the sets via the subclass hook and write them as JSON
        files into `data_path`/`dir`.

        :param x, y: train inputs and targets (lists)
        :param dir: sub-directory of data_path to write into (created if
            missing)
        :param end: suffix appended to every output file name
        :param x_test, y_test: optional test sets (default: empty)
        :param console: optional log text, written to 'console<end>.txt'
        """
        # BUG FIX: the defaults were mutable lists ([]) shared across
        # calls; subclass _save_data implementations mutate their
        # arguments in place, so a fresh list per call is required.
        x_test = [] if x_test is None else x_test
        y_test = [] if y_test is None else y_test
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        if not os.path.exists(dir):
            os.mkdir(dir)
        os.chdir(dir)
        x, y = self._save_data(x, y)
        x_test, y_test = self._save_data(x_test, y_test)
        print("The train set has {} entries".format(len(x)))
        self._writeJSON(x, "x_train" + end)
        self._writeJSON(y, "y_train" + end)
        print("The test set has {} entries".format(len(x_test)))
        self._writeJSON(x_test, "x_test" + end)
        self._writeJSON(y_test, "y_test" + end)
        if console is not None:
            self._writeTXT(console, "console" + end)

    def load_data(self, end="", dir=None, testset="", dir2=None, save=True):
        """Load train/test JSON files, split train into train/eval, and
        optionally write the formatted fine-tuning files.

        :param end: file-name suffix used when the sets were saved
        :param dir: sub-directory of data_path holding the train files
        :param testset: extra suffix selecting a specific test set
        :param dir2: optional different sub-directory for the test files
        :param save: when True, run `_format_data` in `finetune_path`
        """
        testset = str(testset)
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        if dir is not None:
            os.chdir("./" + dir + "/")
        self.loc_data_path = os.getcwd()
        self.xtrain = self._openJSON('x_train', end=end)
        self.ytrain = self._openJSON('y_train', end=end)
        # The test set may live in a different dataset directory.
        if dir2 is not None:
            os.chdir(self.home_path)
            os.chdir(self.data_path)
            os.chdir("./" + dir2 + "/")
        self.xtest = self._openJSON('x_test', end=end + testset)
        self.ytest = self._openJSON('y_test', end=end + testset)
        # split train in evaluation and train
        idx_split = int(self.split * len(self.xtrain))
        self.xeval = self.xtrain[idx_split:]
        self.yeval = self.ytrain[idx_split:]
        self.xtrain = self.xtrain[:idx_split]
        self.ytrain = self.ytrain[:idx_split]
        if save:
            os.chdir(self.finetune_path)
            self._format_data(end)

    def get_train(self):
        """Return [xtrain, ytrain]."""
        return [self.xtrain, self.ytrain]

    def get_eval(self):
        """Return [xeval, yeval]."""
        return [self.xeval, self.yeval]

    def get_test(self):
        """Return [xtest, ytest]."""
        return [self.xtest, self.ytest]

    @abstractmethod
    def print_example(self, set):
        """Print a short sample of the given set (subclass hook)."""
        # BUG FIX: the message used to say "format_data" (copy-paste error).
        raise NotImplementedError(
            "print_example has to be defined in each child class")
class DatasetBert(Dataset):
    """Dataset specialisation producing SQuAD-style JSON files for BERT
    question-answering fine-tuning."""

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the HuggingFace question-answering example directory;
        # it becomes the fine-tuning working directory.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./question-answering")
        super().__init__(home_path, os.getcwd(), data_path, split)

    # private functions
    # protected functions
    def _save_data(self, x, y):
        """Wrap each 'question---context' x entry with the model keywords
        and append the end-of-text token to every y entry (in place)."""
        kw_in, kw_ans, kw_find, kw_end = self.keywords
        for idx, entry in enumerate(x):
            parts = entry.split("---")
            # parts[0] is the question, parts[1] the context text.
            x[idx] = kw_in + parts[1] + kw_find + parts[0] + kw_ans
        for idx in range(len(y)):
            y[idx] += kw_end
        return x, y

    def _format_data(self, end):
        """Write SQuAD-style train/eval JSON files into finetune_path and
        restore the raw 'question---context' form of the in-memory sets."""
        kw = self.keywords

        # Undo the keyword wrapping of one entry: recover its
        # (context, question, answer) triple.
        def extract(xdata, ydata, pos):
            entry = xdata[pos]
            cut = entry.find(kw[2])
            context = entry[len(kw[0]):cut]
            question = entry[cut + len(kw[2]):-len(kw[1])]
            answer = ydata[pos][:-len(kw[3])]
            return context, question, answer

        # Collect all triples of a set into the SQuAD-like dictionary.
        def build_set(xdata, ydata):
            questions, contexts, answers = [], [], []
            for pos in range(len(xdata)):
                context, question, answer = extract(xdata, ydata, pos)
                questions.append(question)
                contexts.append(context)
                answers.append({"answer_start": [context.find(answer)],
                                "text": [answer]})
            return {"data": {"question": questions,
                             "context": contexts,
                             "answers": answers}}

        # generate the train and validation datasets
        qa_train = build_set(self.xtrain, self.ytrain)
        qa_val = build_set(self.xeval, self.yeval)
        os.chdir(self.finetune_path)
        self._writeJSON(qa_train, "train" + end)
        self._writeJSON(qa_val, "eval" + end)

        # Rewrite each set back to 'question---context' / plain answers.
        def strip_keywords(xdata, ydata):
            for pos in range(len(xdata)):
                context, question, answer = extract(xdata, ydata, pos)
                xdata[pos] = question + "---" + context
                ydata[pos] = answer

        strip_keywords(self.xtrain, self.ytrain)
        strip_keywords(self.xeval, self.yeval)
        strip_keywords(self.xtest, self.ytest)
        self.print_example(qa_train)

    # public functions
    def print_example(self, set):
        """Print the first three questions, contexts and answers."""
        for label, field in (("Questions:", "question"),
                             ("Context:", "context"),
                             ("Answers:", "answers")):
            print(label)
            print(set["data"][field][:3])
class DatasetGPT(Dataset):
    """Dataset specialisation producing plain-text corpora for GPT-style
    causal language-model fine-tuning."""

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the HuggingFace language-modeling example directory;
        # it becomes the fine-tuning working directory.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./language-modeling")
        finetune_path = os.getcwd()
        super().__init__(home_path, finetune_path, data_path, split)

    # private functions
    # protected functions
    def _save_data(self, x, y):
        """GPT needs no keyword re-wrapping; return the sets unchanged."""
        return x, y

    @staticmethod
    def _concat(xdata, ydata):
        """Interleave x/y entries into one long corpus string.

        IMPROVED: uses str.join (O(n)) instead of the original duplicated
        '+=' loops, which are quadratic in the corpus size.
        """
        print("The dataset has {} entries".format(len(xdata)))
        # Indexing ydata[i] (rather than zip) preserves the original
        # IndexError if ydata is shorter than xdata.
        return "".join(xdata[i] + ydata[i] for i in range(len(xdata)))

    def _format_data(self, end):
        """Concatenate the train and eval sets into single strings and
        write them as 'train<end>.txt' / 'eval<end>.txt' in finetune_path.

        :param end: suffix appended to the output file names
        """
        # combine the list entries to a single long string
        train_str = self._concat(self.xtrain, self.ytrain)
        eval_str = self._concat(self.xeval, self.yeval)
        # save the training and evaluation files
        os.chdir(self.finetune_path)
        print("start saving files")
        self._writeTXT(train_str, "train" + end)
        self._writeTXT(eval_str, "eval" + end)
        print("saved files")
        # print an example
        self.print_example(train_str)

    # public functions
    def print_example(self, set):
        """Print the first 5000 characters of the corpus string."""
        print(set[:5000])
Event Timeline
Log In to Comment