diff --git a/main_project/gpt2_colab/gen_ch.py b/main_project/gpt2_colab/gen_ch.py
index 69c22d1..36f31da 100644
--- a/main_project/gpt2_colab/gen_ch.py
+++ b/main_project/gpt2_colab/gen_ch.py
@@ -1,98 +1,104 @@
import random
from gen_master import *
class GenDataCH(GenData):
def __init__(self, home_path):
super().__init__(home_path)
self.abrev = "ch"
self.keys = ["surname", "givenName", "dateOfBirth", "height", "placeOfOrigin", "dateOfIssue",
"dateOfExpiry", "sex", "identityCard", "nationality"]
self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "placeOfOrigin", "dateOfIssue",
"dateOfExpiry", "sex", "identityCard", "nationality", "code"]
self.schema = {
"surname": "NAME - NOM - COGNOME - NUM - SURNAME",
"givenName": "VORNAME(N) - PRENOMS(S) - NOME(I) - PRENUM(S) - GIVEN NAME(S)",
"dateOfBirth": "GEBURTSDATUM - DATE DE NAISSANCE - DATA DI NASCITA - DATA DA NASCHIENTSCHA - DATE OF BIRTH",
"height": "GRÖSSE - TAILLE - STATURA - GRONDEZZA - HEIGHT",
"placeOfOrigin": "HEIMATORT - LIEU D'ORIGINE - LUOGO DI ATTINENZA - LIEU D'ORIGIN - PLACE OF ORIGIN",
"authority": "BEHÖRDE - AUTORITÉ - AUTORITÀ - AUTORIDAD - AUTHORITY",
"dateOfIssue": "AUSGESTELLT AM - DÉLIVERÉE LE - RILASCIATA IL - EMESSA ILS - DATE OF ISSUE",
"dateOfExpiry": "GÜLTIG BIS - DATE D'EXPIRATION - DATA DI SCADENZA - DATA DA SCADENZA - DATE OF EXPIRY",
"nationality": "NATIONALITÄT - NATIONALITÉ - CITTADINAZA - NAZIUNALITAD - NATIONALITY",
"identityCard": "IDENTITÄTSKARTE - CARTE D'IDENTITÉ - CARTA DiDENTITÀ - CARTA D'IDENTITAD - IDENTITY CARD",
"sex": "GESCHLECHT - SEE - SESSO - SCHLATTAINA - SEX",
"code": "CODE"
}
# protected functions
def _get_firstName(self):
with open('data.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
# only keep first name, sometimes there are "Bob and Anna"
first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
first_names = [x for x in first_names if x is not None]
first_names = [x.split(' ')[0] for x in first_names]
return first_names
def _get_lastName(self):
with open('data.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
last_names = [x for x in last_names if x is not None]
return last_names
def _get_city(self):
with open('data.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
place_of_origion = [x for x in data["location"] if (x is not None)]
return place_of_origion
def _gen_date_(self, d, m, y):
return self._strnum(d) + " " + self._strnum(m) + " " + str(y)[2:4]
def _gen_identityCard(self):
abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
nb = abc[random.randint(0, len(abc) - 1)]
nb += self._strnum(random.randint(100, 9999999), min_len=7)
return nb
- def _get_answer(self, key):
+ def _get_answer(self, key, set=None):
# is not yet selfconsistent : age-height; dateofIssue-dateof Expiery
if key == "surname":
- return self.fake_data["last name"]
+ if set is None:
+ return self.fake_data["last name"]
+ return self.data_sets[set]["last name"]
elif key == "givenName":
- return self.fake_data["first name"]
+ if set is None:
+ return self.fake_data["first name"]
+ return self.data_sets[set]["first name"]
elif key == "dateOfBirth":
return self._gen_date(1940, 2020)
elif key == "height":
return [random.randint(100, 180)]
elif key == "placeOfOrigin":
- return self.fake_data["city"]
+ if set is None:
+ return self.fake_data["city"]
+ return self.data_sets[set]["city"]
elif key == "dateOfIssue":
return self._gen_date(2010, 2020)
elif key == "dateOfExpiry":
return self._gen_date(2020, 2030)
elif key == "sex":
return ["M", "F"]
elif key == "identityCard":
return [self._gen_identityCard()]
elif key == "nationality":
return ["Schweiz - Suisse - Svizzera - Svizra - Switzerland"]
else:
raise NotImplementedError("key not known")
diff --git a/main_project/gpt2_colab/gen_de.py b/main_project/gpt2_colab/gen_de.py
index 1545760..b3812e9 100644
--- a/main_project/gpt2_colab/gen_de.py
+++ b/main_project/gpt2_colab/gen_de.py
@@ -1,97 +1,101 @@
import random
from gen_master import *
class GenDataDE(GenData):
def __init__(self, home_path):
super().__init__(home_path)
self.abrev = "de"
self.keys = ["surname", "givenName", "dateOfBirth", "height", "dateOfIssue", "dateOfExpiry",
"identityCard", "eyeColor", "placeOfBirth", "nationality"]
self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "authority", "dateOfIssue", "dateOfExpiry",
"identityCard", "signature", "eyeColor", "placeOfBirth", "ArtistName", "address", "nationality"]
self.schema = {
"surname": "Name - Surname - Nom",
"givenName": "Vornamen - Given names - Prénoms",
"dateOfBirth": "Geburtstag - Date of birth - Date de naissance",
"height": "Größe - Height - Taille",
"authority": "Behörde - Authority - Autorité",
"dateOfIssue": "Datum - Date - Date",
"dateOfExpiry": "Gültig bis - Date of expiry - Date d'expiration",
"identityCard": "PERSONALAUSWEIS - IDENTITY CARD - CARTE D'IDENTITE",
"signature": "Unterschrit der Inhaberin/des Inhabers - Signature of bearer - Signature de la titulaire/du titulaire",
"eyeColor": "Aufgenfarbe - Colour of eyes - Couleur des yeux",
"placeOfBirth": "Geburtsort - Place of birth - Lieu de naissance",
"ArtistName": "Ordens- oder Künstlername/Religious name or pseudonym/ Nom de religion ou pseudonyme",
"address": "Anschrift - Adress - Adresse",
"nationality": "Staatsangehörigkeit - Nationality - Nationalité",
}
# protected functions
def _get_firstName(self):
with open('data_germany.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
# only keep first name, sometimes there are "Bob and Anna"
first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
first_names = [x for x in first_names if x is not None]
first_names = [x.split(' ')[0] for x in first_names]
return first_names
def _get_lastName(self):
with open('data_germany.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
last_names = [x for x in last_names if x is not None]
return last_names
def _get_city(self):
with open('data_geneva_city.json', 'r') as fp:
data = json.load(fp)
return data['region_de']
def _gen_date_(self, d, m, y):
return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)
def _gen_identityCard(self):
abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789012345678901234567890123456789"
nb = ""
while len(nb) < 9:
nb += abc[random.randint(0, len(abc) - 1)]
return nb
- def _get_answer(self, key):
- ["surname", "givenName", "dateOfBirth", "height", "dateOfIssue", "dateOfExpiry",
- "identityCard", "eyeColor", "placeOfBirth", "nationality"]
+ def _get_answer(self, key, set=None):
# is not yet selfconsistent : age-height; dateofIssue-dateof Expiery
if key == "surname":
- return self.fake_data["last name"]
+ if set is None:
+ return self.fake_data["last name"]
+ return self.data_sets[set]["last name"]
elif key == "givenName":
- return self.fake_data["first name"]
+ if set is None:
+ return self.fake_data["first name"]
+ return self.data_sets[set]["first name"]
elif key == "dateOfBirth":
return self._gen_date(1940, 2020)
elif key == "height":
return [random.randint(100, 180)]
elif key == "placeOfBirth":
- return self.fake_data["city"]
+ if set is None:
+ return self.fake_data["city"]
+ return self.data_sets[set]["city"]
elif key == "dateOfIssue":
return self._gen_date(2010, 2020)
elif key == "dateOfExpiry":
return self._gen_date(2020, 2030)
elif key == "identityCard":
return [self._gen_identityCard()]
elif key == "nationality":
return ["DEUTSCH"]
elif key == "eyeColor":
return ["bernstein", "grün", "grünbraun", "grau", "blau", "hellbraun", "hellblau", "blaugrün"]
else:
raise NotImplementedError("key not known")
diff --git a/main_project/gpt2_colab/gen_fake_data.py b/main_project/gpt2_colab/gen_fake_data.py
index 084cd3c..c78bd05 100644
--- a/main_project/gpt2_colab/gen_fake_data.py
+++ b/main_project/gpt2_colab/gen_fake_data.py
@@ -1,32 +1,36 @@
from gen_master import *
from gen_ch import *
from gen_fi import *
from gen_fr import *
from gen_de import *
from gen_it import *
import os
homepath = os.getcwd()
CH = GenDataCH(homepath)
FI = GenDataFI(homepath)
FR = GenDataFR(homepath)
DE = GenDataDE(homepath)
IT = GenDataIT(homepath)
countries = [CH, FI, FR, DE, IT]
for c in countries:
c.load_real_data()
for c in countries:
- c.run_fakeGen(250, countries)
+ c.run_fakeGen(2500, countries)
+
+for c in countries:
+ c.load_real_data()
print("CH_part_"*10)
-CH.fill_schema(nb_keys=3)
+# CH.fill_schema(nb_keys=3)
-for c in countries:
+
+for c in [DE, IT]:
print()
print("{}_full_".format(c._get_country()) * 10)
- c.fill_schema(nb_keys="all")
\ No newline at end of file
+ c.fill_schema(nb_keys="all", dir=c._get_country().upper())
diff --git a/main_project/gpt2_colab/gen_fi.py b/main_project/gpt2_colab/gen_fi.py
index 97671dd..57b98ef 100644
--- a/main_project/gpt2_colab/gen_fi.py
+++ b/main_project/gpt2_colab/gen_fi.py
@@ -1,111 +1,115 @@
import random
from gen_master import *
class GenDataFI(GenData):
def __init__(self, home_path):
super().__init__(home_path)
self.abrev = "fi"
self.keys = ["surname", "givenName", "dateOfBirth", "dateOfIssue", "dateOfExpiry", "sex", "identityCard", "nationality"]
self.keys_all = ["surname", "givenName", "dateOfBirth", "dateOfIssue", "dateOfExpiry", "sex", "identityCard",
"nationality", "one", "two"]
self.schema = {
"surname": "SUKUNIMI - EFTERNAMN",
"givenName": "ETUNIMET - FÖRNAMN",
"dateOfBirth": "SYNTYMÄAIKA - FÖDELSEDATUM",
"dateOfIssue": "MYÖNNETTY - UTFÄRDAT",
"dateOfExpiry": "VOIMASSA - GILTIGT T.O.M.",
"nationality": "KANSALAISUUS - NATIONALITET",
"identityCard": "KORTTINUMERO - KORTNUMMER",
"sex": "SUKUPUOLI - KÖN",
"one": "TUNNUS - KOD",
"two": "CAN"
}
# protected functions
def _get_firstName(self):
with open('data_fi.json', 'r') as fp:
data = json.load(fp)
first = []
last = []
for n in data['names']:
tmp = n.split(" ")
# if more then 2 spaces are used, it's most likely not a name
if len(tmp) != 2:
pass
else:
if len(tmp[0]) > 3:
first.append(tmp[0])
if len(tmp[1]) > 3:
last.append(tmp[1])
return first
def _get_lastName(self):
with open('data_fi.json', 'r') as fp:
data = json.load(fp)
first = []
last = []
for n in data['names']:
tmp = n.split(" ")
# if more then 2 spaces are used, it's most likely not a name
if len(tmp) != 2:
pass
else:
if len(tmp[0]) > 3:
first.append(tmp[0])
if len(tmp[1]) > 3:
last.append(tmp[1])
return last
def _get_city(self):
with open('data_fi.json', 'r') as fp:
data = json.load(fp)
locations_fi = data['addresses']
city_fi = []
for x in locations_fi:
tmp = x.split(" ")
tmp = tmp[-1].lower()
tmp = tmp[0].upper() + tmp[1:]
city_fi.append(tmp)
with open('data_geneva_city.json', 'r') as fp:
data = json.load(fp)
city_fi += data['region_fi']
return city_fi
def _gen_date_(self, d, m, y):
return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)
def _gen_identityCard(self):
return random.randint(100000000, 999999999)
- def _get_answer(self, key):
+ def _get_answer(self, key, set=None):
# is not yet selfconsistent : age-height; dateofIssue-dateof Expiery
if key == "surname":
- return self.fake_data["last name"]
+ if set is None:
+ return self.fake_data["last name"]
+ return self.data_sets[set]["last name"]
elif key == "givenName":
- return self.fake_data["first name"]
+ if set is None:
+ return self.fake_data["first name"]
+ return self.data_sets[set]["first name"]
elif key == "dateOfBirth":
return self._gen_date(1940, 2020)
elif key == "dateOfIssue":
return self._gen_date(2010, 2020)
elif key == "dateOfExpiry":
return self._gen_date(2020, 2030)
elif key == "sex":
return ["M", "F"]
elif key == "identityCard":
return [self._gen_identityCard()]
elif key == "nationality":
return ["FIN"]
else:
raise NotImplementedError("key not known")
\ No newline at end of file
diff --git a/main_project/gpt2_colab/gen_fr.py b/main_project/gpt2_colab/gen_fr.py
index a6c7c05..8c51d56 100644
--- a/main_project/gpt2_colab/gen_fr.py
+++ b/main_project/gpt2_colab/gen_fr.py
@@ -1,91 +1,97 @@
import random
from gen_master import *
class GenDataFR(GenData):
def __init__(self, home_path):
super().__init__(home_path)
self.abrev = "fr"
self.keys = ["surname", "givenName", "dateOfBirth", "height", "placeOfBirth", "dateOfIssue", "dateOfExpiry", "sex",
"identityCard", "nationality"]
self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "placeOfBirth", "dateOfIssue", "dateOfExpiry", "sex",
"identityCard", "nationality"]
self.schema = {
"surname": "Nom",
"givenName": "Prénom(s)",
"dateOfBirth": "Né(e) le",
"height": "Taille",
"placeOfBirth": "à",
"authority": "par",
"dateOfIssue": "délivrée le",
"dateOfExpiry": "Carte valable jusqu'au",
"nationality": "Nationalité",
"identityCard": "CARTE NATIONALE D'IDENTITÉ N°",
"sex": "Sexe",
"signature": "Signature du titulaire",
"signature2": "Signature de l'autorité",
"address": "Adresse"
}
# protected functions
def _get_firstName(self):
with open('data_geneva.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
# only keep first name, sometimes there are "Bob and Anna"
first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
first_names = [x for x in first_names if x is not None]
first_names = [x.split(' ')[0] for x in first_names]
return first_names
def _get_lastName(self):
with open('data_geneva.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
last_names = [x for x in last_names if x is not None]
return last_names
def _get_city(self):
with open('data_geneva_city.json', 'r') as fp:
data = json.load(fp)
return data['region']
def _gen_date_(self, d, m, y):
return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)
def _gen_identityCard(self):
return random.randint(100000000000, 999999999999)
- def _get_answer(self, key):
+ def _get_answer(self, key, set=None):
# is not yet selfconsistent : age-height; dateofIssue-dateof Expiery
if key == "surname":
- return self.fake_data["last name"]
+ if set is None:
+ return self.fake_data["last name"]
+ return self.data_sets[set]["last name"]
elif key == "givenName":
- return self.fake_data["first name"]
+ if set is None:
+ return self.fake_data["first name"]
+ return self.data_sets[set]["first name"]
elif key == "dateOfBirth":
return self._gen_date(1940, 2020)
elif key == "height":
return [random.randint(100, 180)]
elif key == "placeOfBirth":
- return self.fake_data["city"]
+ if set is None:
+ return self.fake_data["city"]
+ return self.data_sets[set]["city"]
elif key == "dateOfIssue":
return self._gen_date(2010, 2020)
elif key == "dateOfExpiry":
return self._gen_date(2020, 2030)
elif key == "sex":
return ["M", "F"]
elif key == "identityCard":
return [self._gen_identityCard()]
elif key == "nationality":
return ["Française"]
else:
raise NotImplementedError("key not known")
\ No newline at end of file
diff --git a/main_project/gpt2_colab/gen_it.py b/main_project/gpt2_colab/gen_it.py
index 523ef77..113efe9 100644
--- a/main_project/gpt2_colab/gen_it.py
+++ b/main_project/gpt2_colab/gen_it.py
@@ -1,140 +1,146 @@
import random
from gen_master import *
import copy
class GenDataIT(GenData):
def __init__(self, home_path):
super().__init__(home_path)
self.abrev = "it"
self.keys = ["surname", "givenName", "dateOfBirth", "height", "sex", "dateOfIssue", "dateOfExpiry", "nationality",
"identityCard", "placeOfBirth"]
self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "sex", "authority", "dateOfIssue", "dateOfExpiry", "nationality",
"identityCard", "signature", "parents", "fiscalCode", "birthCode", "address"]
self.schema = {
"surname": "COGNOME - SURNAME",
"givenName": "NOME - NAME",
"dateOfBirth": "LUOGO E DATA DI NASCITA - PLACE AND DATE OF BIRTH",
"height": "STATURA - HEIGHT",
"sex": "SESSO - SEX",
"authority": "COMUNE DI - MUNICIPALITY",
"dateOfIssue": "EMISSIONE - ISSUING",
"dateOfExpiry": "SCADENZA - EXPIRY",
"nationality": "CITTADINANZA - NATIONALITY",
"identityCard": "CARTA DI IDENTITA - IDENTITY CARD",
"signature": "FIRMA DEL TITOLARE - HOLDER'S SIGNATURE",
"parents": "COGNOME E NOME DE GENITORI O DI CHI NE FA LE VECI - SURNAME AND NAME OF PARENTS OR LEGAL GUARDIAN",
"fiscalCode": "CODICE FISCALE - FISCAL CODE",
"birthCode": "ESTREMAKATTO DI NASCITA",
"address": "INDIRIZZO DI RESIDENZA - RESIDENCE",
}
# protected functions
def _get_firstName(self):
with open('data_italy.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
# only keep first name, sometimes there are "Bob and Anna"
first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
first_names = [x for x in first_names if x is not None]
first_names = [x.split(' ')[0] for x in first_names]
return first_names
def _get_lastName(self):
with open('data_italy.json', 'r') as fp:
data = json.load(fp)
random.seed(42)
last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
last_names = [x for x in last_names if x is not None]
return last_names
def _get_city(self):
with open('data_geneva_city.json', 'r') as fp:
data = json.load(fp)
return data['region_it']
def _gen_date_(self, d, m, y):
return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)
def _gen_identityCard(self):
abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789012345678901234567890123456789"
nb = ""
while len(nb) < 9:
nb += abc[random.randint(0, len(abc) - 1)]
return nb
- def _get_answer(self, key):
+ def _get_answer(self, key, set=None):
# is not yet selfconsistent : age-height; dateofIssue-dateof Expiery
if key == "surname":
- return self.fake_data["last name"]
+ if set is None:
+ return self.fake_data["last name"]
+ return self.data_sets[set]["last name"]
elif key == "givenName":
- return self.fake_data["first name"]
+ if set is None:
+ return self.fake_data["first name"]
+ return self.data_sets[set]["first name"]
elif key == "dateOfBirth":
- return self.fake_data["city"], self._gen_date(1940, 2020)
+ if set is None:
+ return self.fake_data["city"], self._gen_date(1940, 2020)
+ return self.data_sets[set]["city"], self._gen_date(1940, 2020)
elif key == "height":
return [random.randint(100, 180)]
elif key == "dateOfIssue":
return self._gen_date(2010, 2020)
elif key == "dateOfExpiry":
return self._gen_date(2020, 2030)
elif key == "sex":
return ["M", "F"]
elif key == "identityCard":
return [self._gen_identityCard()]
elif key == "nationality":
return ["ITA"]
else:
raise NotImplementedError("key not known")
def _fill_schema(self, question, nb_keys="all"):
# special case of italy, because place of birth and date of birth are a combined label
tmp = copy.deepcopy(self.keys)
tmp.remove("placeOfBirth")
if nb_keys != "all":
random.shuffle(tmp)
tmp = tmp[:nb_keys]
if question in tmp:
pass
elif question == "placeOfBirth" and "dateOfBirth" in tmp:
pass
elif question == "placeOfBirth":
tmp[random.randint(0, len(tmp) - 1)] = "dateOfBirth"
else:
# replace a random entry
tmp[random.randint(0, len(tmp) - 1)] = question
context = ""
answer = ""
for key in tmp:
if key != "dateOfBirth":
ans = self._get_answer(key)
ans = ans[random.randint(0, len(ans)-1)]
if key == question:
answer = ans
else:
ans1, ans2 = self._get_answer(key)
ans1 = ans1[random.randint(0, len(ans1))]
ans2 = ans2[random.randint(0, len(ans2))]
ans = str(ans1) + " " + str(ans2)
if question == "dateOfBirth":
answer = ans2
elif question == "placeOfBirth":
answer = ans1
context += self.schema[key] + ": " + str(ans) + "; "
return context, question, answer
diff --git a/main_project/gpt2_colab/gen_master.py b/main_project/gpt2_colab/gen_master.py
index 886e70d..d36d960 100644
--- a/main_project/gpt2_colab/gen_master.py
+++ b/main_project/gpt2_colab/gen_master.py
@@ -1,371 +1,477 @@
import json
import os
import random
import copy
from abc import abstractmethod
from transformers import TFGPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
class GenData:
def __init__(self, home_path):
self.home_path = home_path
+ os.chdir(self.home_path)
+ os.chdir("./datasets")
+ self.data_path = os.getcwd()
+
os.chdir(self.home_path)
os.chdir("./transformers/examples/")
os.chdir("./language-modeling")
self.finetune_path = os.getcwd()
self.nbEpochs = 5
self.outModelName = "genFakeData5"
os.chdir(self.home_path)
os.chdir("./models")
self.model_path = os.getcwd()
os.chdir(self.home_path)
os.chdir("./crawl_data")
self.crawl_path = os.getcwd()
self.data = {"first name": [], "last name": [], "city": []}
self.fake_data = {"first name": [], "last name": [], "city": []}
self.model = None
self.tokenizer = None
self.keys = None
self.keys_all = None
self.schema = None
self.abrev = None
+ self.tt = ["train", "test1", "test2", "test3", "test4"]
+
+ self.data_sets = {}
+ for t in self.tt:
+ self.data_sets[t] = {"first name": [], "last name": [], "city": []}
+
+ self.tt_s = [0, 1500, 250, 250, 250, 250]
+
+ # starting index
+ for i, _ in enumerate(self.tt_s[:-1]):
+ self.tt_s[i+1] += self.tt_s[i]
+
+ # starting index as fraction of total length
+ for i, _ in enumerate(self.tt_s):
+ self.tt_s[i] = self.tt_s[i]/self.tt_s[-1]
+
# Private methods
# protected functions
def _strnum(self, i, min_len=2):
i = str(i)
while (len(i) < min_len):
i = "0" + i
return i
@abstractmethod
def _get_firstName(self):
raise NotImplementedError("abstract method")
@abstractmethod
def _get_lastName(self):
raise NotImplementedError("abstract method")
@abstractmethod
def _get_city(self):
raise NotImplementedError("abstract method")
@abstractmethod
def _get_answer(self, key):
raise NotImplementedError("abstract method")
- def _fill_schema(self, question, nb_keys="all"):
+ def _fill_schema(self, question, nb_keys="all", set_=None):
tmp = copy.deepcopy(self.keys)
if nb_keys != "all":
random.shuffle(tmp)
tmp = tmp[:nb_keys]
if question in tmp:
pass
else:
# replace a random entry
tmp[random.randint(0, len(tmp) - 1)] = question
context = ""
answer = ""
for key in tmp:
- ans = self._get_answer(key)
+ ans = self._get_answer(key, set=set_)
ans = ans[random.randint(0, len(ans)-1)]
context += self.schema[key] + ": " + str(ans) + "; "
if key == question:
answer = ans
- return context, question, answer
+ return context, question, str(answer)
def __doubleCapital(self, word):
for i, l in enumerate(word[:-1]):
if l.isupper() and word[i+1].isupper() and l.lower() in "abcdefghijklmnopqrstuvwxyz" \
and word[i+1].lower() in "abcdefghijklmnopqrstuvwxyz":
return True
+ if i>0 and l.isupper() and word[i-1] not in [" ", ".", "'"]:
+ return True
return False
-
def _clean_name(self, dirty, countries=[]):
# collect all city names
city_names = []
for c in countries:
data, _ = c._get_data()
city_names += data["city"]
clean = []
for d in dirty:
# Name contains spaces and is not "van ..."
if " " in d and "van" not in d:
pass
# first letter is lowercase or "." is in word, or it's just two letters
elif d[0].upper() != d[0] or "." in d or len(d) <= 2 or "(" in d or ")" in d:
pass
+ elif d[0].lower() not in "abcdefghijklmnopqrstuvwxyz":
+ pass
# contains numbers
elif any(char.isdigit() for char in d):
pass
# everything is in upper case
elif all(char.isupper() for char in d):
pass
# specific for finnish crawl
elif "A-Z" in d or "oy" == d.lower() or "studio" in d.lower():
pass
elif ">" in d or "<" in d or "/" in d:
pass
elif self.__doubleCapital(d):
pass
elif d in city_names:
pass
else:
clean.append(d)
return clean
- def _clean_city(self, dirty, countries=[]):
+ def _clean_city(self, dirty, countries=[], countries2=[]):
# collect all city names
other_city_names = []
for c in countries:
data, _ = c._get_data()
other_city_names += data["city"]
+ names = []
+ for c in countries:
+ data, _ = c._get_data()
+ names += data["first name"]
+ names += data["last name"]
+
+ for c in countries2:
+ data, _ = c._get_data()
+ names += data["first name"]
+ names += data["last name"]
+
clean = []
for d in dirty:
# first letter is lowercase, or it's just two letters
if d[0].upper() != d[0] or len(d) <= 2 or "(" in d or ")" in d:
pass
+ # first letter is lowercase or it's just two letters
+ elif d[0].upper() != d[0] or len(d) <= 2:
+ pass
+ elif d[0].lower() not in "abcdefghijklmnopqrstuvwxyz":
+ pass
# contains numbers
elif any(char.isdigit() for char in d):
pass
# everything is in upper case
elif all(char.isupper() for char in d):
pass
elif ">" in d or "<" in d or "/" in d:
pass
elif self.__doubleCapital(d):
pass
elif d in other_city_names:
pass
+ # don't accept person names
+ elif d in names:
+ pass
else:
clean.append(d)
return clean
@abstractmethod
def _gen_date_(self, d, m, y):
raise NotImplementedError("abstract method")
@abstractmethod
def _gen_identityCard(self):
raise NotImplementedError("abstract method")
def _gen_date(self, from_y, to_y):
# year should be given in the format 2012 (4 digits)
dates = []
for d in range(1, 32):
for m in range(1, 13):
for y in range(from_y + 1, to_y + 1):
dates.append(self._gen_date_(d, m, y))
return dates
def _gpt_train_entry(self, ident, nat, list):
keywords = ["", "", "", "<|endoftext|>"]
out = []
for l in list:
out.append(keywords[0] + str(list[:10]) + "<{}><{}>".format(ident, nat) + l + keywords[3])
return out
def _gpt_save(self, list):
long_str = ""
for x in list:
try:
long_str += x + "\n"
except:
pass
import io
with io.open("gen_data_train.txt", 'w', encoding="utf-8") as f:
# with open("gen_data_train.txt", 'w') as f:
f.write(long_str)
def _get_data(self):
return self.data, self.fake_data
def _set_data(self, data, fake_data):
if data is not None:
self.data = data
if fake_data is not None:
self.fake_data = fake_data
def _get_country(self):
return self.abrev
def _fake_gen(self, ident, nat, list):
keywords = ["", "", "", "<|endoftext|>"]
- prompt = keywords[0] + str(list[:10]) + "<{}><{}>".format(ident, nat) # + l + keywords[3]
+ prompt = keywords[0] + str(list) + "<{}><{}>".format(ident, nat) # + l + keywords[3]
input_ids = self.tokenizer.encode(prompt, return_tensors='tf')
generated_text_samples = self.model.generate(
input_ids,
max_length=len(input_ids[0]) + 50,
num_return_sequences=1,
- no_repeat_ngram_size=0,
- repetition_penalty=1.0,
+ no_repeat_ngram_size=3,
+ repetition_penalty=1.2,
top_p=1.0,
- temperature=1.0,
+ temperature=1.5,
do_sample=True,
top_k=0,
early_stopping=True
)
answer = self.tokenizer.decode(generated_text_samples[0])
answer = answer.replace(prompt, "")
answer = answer.replace("<|endoftext|>", "")
return answer
# Public methods
def rm_dub(self, dirty): # remove dublicates
return list(set(dirty))
def load_real_data(self):
os.chdir(self.crawl_path)
self.data["first name"] = self._clean_name(self.rm_dub(self._get_firstName()))
self.data["last name"] = self._clean_name(self.rm_dub(self._get_lastName()))
self.data["city"] = self._clean_city(self.rm_dub(self._get_city()))
+ try:
+ for t in self.tt:
+ filename = "fake_id_data_" + self.abrev + "_" + t + ".json"
+ print(filename)
+ with open(filename, 'r') as f:
+ self.data_sets[t] = json.load(f)
+ except:
+ print("couldn't load test and train sets")
+
def train_fakeGen(self, countries):
# countries is a list of child classes
train_data = []
for ident in self.data.keys():
for country in countries:
data, _ = country._get_data()
random.shuffle(data)
if len(data) > 5000:
data = data[:5000]
else:
print("Warning, label <{}> in nationality <{}> only has {} entries".format(
ident, country._get_country(), len(data)))
train_data.append(self._gpt_train_entry(ident, country._get_country(), data))
random.shuffle(train_data)
self._gpt_save(train_data)
os.chdir(self.finetune_path)
with open("real_id_data" + ".json", "w") as f:
json.dump(data, f)
cmd = "python run_clm.py \
--model_type {} \
--train_file \"{}\" \
--do_train \
--per_gpu_train_batch_size 1 \
--save_steps -1 \
--num_train_epochs {} \
--fp16 \
--tokenizer_name gpt2 \
--model_name_or_path gpt2 \
--output_dir=\"{}\" \
".format(
"gpt2",
"gen_data_train" + ".txt",
self.nbEpochs,
self.model_path + "/" + self.outModelName)
print(cmd)
os.system(cmd)
def run_fakeGen(self, nb, countries):
os.chdir(self.home_path)
self.model = TFGPT2LMHeadModel.from_pretrained(self.model_path + "/" + self.outModelName, from_pt=True)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + self.outModelName)
os.chdir(self.crawl_path)
#try:
f = 'fake_id_data_'+ self.abrev +'.json'
print(f)
with open(f, 'r') as fp:
fake_data = json.load(fp)
# apply new cleaning rules
for key in fake_data.keys():
if key in ["first name", "last name"]:
fake_data[key] = self._clean_name(fake_data[key], countries)
if key in ["city"]:
- fake_data[key] = self._clean_city(fake_data[key], [c for c in countries if c._get_country() != self.abrev])
+ fake_data[key] = self._clean_city(fake_data[key],
+ [c for c in countries if c._get_country() != self.abrev],
+ [c for c in countries if c._get_country() == self.abrev])
self.fake_data = fake_data
print("{}:\n# first names: {}, # last names: {}, # cities: {}".format(self.abrev,
len(fake_data["first name"]),
len(fake_data["last name"]),
len(fake_data["city"])))
#except:
# print("couldn't find existing fake data entries")
for ident in self.data.keys():
print("{}, {}".format(ident, self.abrev))
while len(self.fake_data[ident]) < nb:
data, fake_data = self._get_data()
- tmp = self._fake_gen(ident, self.abrev, data[ident])
+ # tmp = self._fake_gen(ident, self.abrev, data[ident])
+ if len(self.fake_data[ident]) < 100:
+ tmp = self._fake_gen(ident, self.abrev, data[ident][:10])
+ else:
+ sample_fake = copy.deepcopy(self.fake_data[ident])
+ random.shuffle(sample_fake)
+
+ tmp = self._fake_gen(ident, self.abrev, sample_fake[:10]+data[ident][:10])
# clean names
try:
if ident in ["first name", "last name"]:
tmp = self._clean_name([tmp], countries)[0]
if ident in ["city"]:
- tmp = self._clean_city([tmp], [c for c in countries if c._get_country() != self.abrev])[0]
+ tmp = self._clean_city([tmp],
+ [c for c in countries if c._get_country() != self.abrev],
+ [c for c in countries if c._get_country() == self.abrev])[0]
fake_data[ident].append(tmp)
except:
# if the clean function returns an empty list
pass
fake_data[ident] = self.rm_dub(fake_data[ident])
self.fake_data = fake_data
new = [tmp for tmp in fake_data[ident] if tmp not in data[ident]]
print("There have been {} new generation, and {} identical to the test set".format(len(new), len(
fake_data[ident]) - len(new)))
data, fake_data = self._get_data()
new = [tmp for tmp in fake_data[ident] if tmp not in data[ident]]
print(new)
print("There have been {} new generation, and {} identical to the test set".format(len(new), len(
fake_data[ident]) - len(new)))
print("{}, {} -- ended search".format(ident, self.abrev))
os.chdir(self.crawl_path)
with open("fake_id_data_"+ self.abrev + ".json", "w") as f:
json.dump(fake_data, f)
- def fill_schema(self, keys="all", nb_keys="all"):
+ # split into train and test set
+
+ os.chdir(self.crawl_path)
+
+ shuffled_data = copy.deepcopy(fake_data)
+ for key in shuffled_data.keys():
+ random.shuffle(shuffled_data[key])
+
+ for i, t in enumerate(self.tt):
+ fake_data_subset = {"first name": [], "last name": [], "city": []}
+ for key in fake_data_subset.keys():
+ l = len(shuffled_data[key])
+ subset = shuffled_data[key][int(self.tt_s[i]*l):int(self.tt_s[i+1]*l)]
+ fake_data_subset[key] = subset
+
+ with open("fake_id_data_" + self.abrev + "_" + t + ".json", "w") as f:
+ json.dump(fake_data_subset, f)
+
+ def fill_schema(self, keys="all", nb_keys="all", dir="myDataset"):
if keys == "all":
keys = self.keys
- for k in keys:
- context, question, answer = self._fill_schema(k, nb_keys=nb_keys)
- print(context)
- print(question)
- print(answer)
\ No newline at end of file
+ for t in self.tt:
+ x = []
+ y = []
+
+ qa_set = {"data": {"question": [], "context": [], "answers": []}}
+ # generate 1000 examples for each question
+ for _ in range(1000):
+ for k in keys:
+ context, question, answer = self._fill_schema(k, nb_keys=nb_keys, set_=t)
+ # print(context)
+ # print(question)
+ # print(answer)
+
+ keywords = ["", "", "", "<|endoftext|>"]
+ x.append(keywords[0] + context + keywords[2] + question + keywords[1])
+ y.append(answer + keywords[3])
+
+ qa_set["data"]["question"].append(question)
+ qa_set["data"]["context"].append(context)
+ qa_set["data"]["answers"].append({"answer_start": [context.find(answer)], "text": [answer]})
+
+ os.chdir(self.data_path)
+ if not os.path.exists(dir):
+ os.mkdir(dir)
+ os.chdir(dir)
+ with open("x_" + t + ".json", "w") as f:
+ json.dump(x, f)
+ with open("y_" + t + ".json", "w") as f:
+ json.dump(y, f)
+ with open("qa_" + t + ".json", "w") as f:
+ json.dump(qa_set, f)
\ No newline at end of file
diff --git a/main_project/gpt2_colab/main.py b/main_project/gpt2_colab/main.py
index 2b0a4e8..2e01578 100644
--- a/main_project/gpt2_colab/main.py
+++ b/main_project/gpt2_colab/main.py
@@ -1,331 +1,338 @@
# score per question category
# entropy per question (if it's always the same, don't accept it? Prolem with nationality i.e.)
import os
import load_files as lf
import model as mo
import json
import matplotlib.pyplot as plt
# choose the model type
MODEL = "gpt" # gpt, bert
# train the model from scratch or from a checkpoint
SCRATCH = False # True, False
# choose the checkpoint
-CHECKPOINT = "gpt2-large" # xlm-roberta-base, gpt2, None, roberta-base, "gpt2-medium", "gpt2-large", "gpt2-xl"
+CHECKPOINT = "gpt2" # xlm-roberta-base, gpt2, None, roberta-base, "gpt2-medium", "gpt2-large", "gpt2-xl"
# train the model in this run
TRAIN = True # True, False
# number of training epochs
EPOCHS = 1
# new model name
-NAME = "gpt_L_e_1_test" # xmlr_e_10_test, gpt_e_1_test
+NAME = "gpt_e1_CHpart" # xmlr_e_10_test, gpt_e_1_test
# probaModes to be tested
PROBA = ["longOk"] #, "longOk", ["mult", "forceNon0", "maxNon0"], ["mult", "longOk"]
# number of times the model should be retrained (0 is never)
RETRAIN = 0
home_path = os.getcwd()
# load a dataset
if MODEL == "gpt":
print("Start instances for a gpt2 model")
dataset = lf.DatasetGPT(home_path)
model = mo.GPTModel(home_path, printStep=-1)
elif MODEL == "bert":
print("Start instances for a bert model")
dataset = lf.DatasetBert(home_path)
model = mo.BertModel(home_path, printStep=-1)
else:
NotImplementedError("Model type not defined")
# load a dataset
dataset.load_data()
+# dataset.load_data(dir="fi_mixed_full", end="_fi")
# train a tokenizer from scratch
if SCRATCH:
NotImplementedError("Train a tokenizer")
SCRATCH = "loc of tok"
else:
SCRATCH = None
if TRAIN:
model.train(nbEpochs=EPOCHS,
outModelName=NAME,
startCheckpoint=CHECKPOINT,
dataEnd="",
tokenizerLocaction=SCRATCH)
# load the model
model.load_model(NAME)
all_scores = []
def retrain(model, dataset, NAME, CHECKPOINT, EPOCHS, nb_used=100, end="", onlySave=True, console=""):
xsure, ysure = model.getSureGuesses()
xtest, ytest = dataset.get_test()
if len(xsure) == 0:
print("{}\n{}\n{}\n{}\n{}".format("="*50, "="*50, "there are no sure guesses...", "="*50, "="*50))
return model, dataset, NAME, CHECKPOINT
while len(xsure) < 100:
xsure = xsure + xsure
ysure = ysure + ysure
dataset.save_data(xsure,
ysure,
NAME,
x_test=xtest[nb_used:],
y_test=ytest[nb_used:],
console=console,
)
if not onlySave:
CHECKPOINT = NAME
NAME += "_adapt_" + end
dataset.load_data(dir=CHECKPOINT, end="")
if end != "":
end = "_" + end
model.train(nbEpochs=EPOCHS,
outModelName=NAME,
startCheckpoint=CHECKPOINT,
tokenizerLocaction=CHECKPOINT,
dataEnd="",
)
# load the model
model.load_model(NAME)
return model, dataset, NAME, CHECKPOINT
def K_alpha(tp, tn, fp, fn, alpha):
K = []
for i, _ in enumerate(tp):
K.append(tp[i]/(tp[i]+fp[i]*alpha+tn[i]+fn[i]))
return K
def generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end = "", nb_used=1000):
legend_color = {
"all": "black",
"sex": "goldenrod",
"height": "red",
"dateOfBirth": "plum",
"dateOfExpiry": "fuchsia",
"dateOfIssue": "deeppink",
"placeOfBirth": "darkgreen",
"surname": "lawngreen",
"givenName": "mediumseagreen",
"placeOfOrigin": "darkslategrey",
}
legend_name = {
"all": "all",
"sex": "sex",
"height": "height",
"dateOfBirth": "date of birth",
"dateOfExpiry": "date of expiry",
"dateOfIssue": "date of issue",
"placeOfBirth": "place of birth",
"surname": "last name",
"givenName": "first name",
"placeOfOrigin": "place of origin",
}
plt.rcParams.update({"text.usetex": True})
for m in PROBA:
model.load_model(NAME)
model.set_proba_mode(m)
NAME_r = NAME
for r in range(RETRAIN + 1):
scores_dict, scores, console = model.generate(dataset.get_test(), 0, nb_used)
all_scores.append(scores)
# print all scores
keys = scores_dict.keys()
buckets = len(scores_dict["all"]["count"])
lim = [x/buckets for x in range(buckets)]
for key in keys:
plt.title(key)
plt.plot(lim, scores_dict[key]["tp"], 'g+-', label="tp")
plt.plot(lim, scores_dict[key]["tn"], 'go--', label="tn")
plt.plot(lim, scores_dict[key]["fp"], 'r+-', label="fp")
plt.plot(lim, scores_dict[key]["fn"], 'ro--', label="fn")
tmp = [150, 500, 1000, 2000, 5000]
for upper in tmp:
if scores_dict[key]["tp"][0]+scores_dict[key]["fp"][0] < upper:
break
plt.ylim([0, upper])
plt.xlabel("network confidence score")
plt.ylabel("# examples")
plt.legend()
os.chdir(home_path)
plt.savefig("f_tp_{}_{}.eps".format(key, r), format="eps")
plt.savefig("f_tp_{}_{}.jpg".format(key, r), format="jpg")
plt.close()
# ---------------
plt.title(key)
plt.plot(lim, scores_dict[key]["f1"], 'go-', label="F1")
plt.plot(lim, scores_dict[key]["recall"], 'bx:', label="recall")
plt.plot(lim, scores_dict[key]["precision"], 'md--', label="precision")
plt.ylim([0,1])
plt.xlabel("network confidence score")
plt.ylabel("score")
plt.legend()
os.chdir(home_path)
plt.savefig("f_sc_{}_{}.eps".format(key, r), format="eps")
plt.savefig("f_sc_{}_{}.jpg".format(key, r), format="jpg")
plt.close()
# ----------------
plt.title(key)
plt.plot(lim, scores_dict[key]["count"], "ko-")
tmp = [150, 500, 1000, 2000, 5000]
for upper in tmp:
if scores_dict[key]["count"][0] < upper:
break
plt.ylim([0, upper])
plt.xlabel("network confidence score")
plt.ylabel("# examples")
os.chdir(home_path)
plt.savefig("f_conf_{}_{}.eps".format(key, r), format="eps")
plt.savefig("f_conf_{}_{}.jpg".format(key, r), format="jpg")
plt.close()
# ----------------
plt.title(key)
plt.plot(lim, K_alpha(scores_dict[key]["tp"],
scores_dict[key]["tn"],
scores_dict[key]["fp"],
scores_dict[key]["fn"],
1),
'co-', label="\\alpha=1")
plt.plot(lim, K_alpha(scores_dict[key]["tp"],
scores_dict[key]["tn"],
scores_dict[key]["fp"],
scores_dict[key]["fn"],
2),
'cx:', label="\\alpha=2")
plt.plot(lim, K_alpha(scores_dict[key]["tp"],
scores_dict[key]["tn"],
scores_dict[key]["fp"],
scores_dict[key]["fn"],
10),
'cd--', label="\\alpha=10")
plt.plot(lim, K_alpha(scores_dict[key]["tp"],
scores_dict[key]["tn"],
scores_dict[key]["fp"],
scores_dict[key]["fn"],
100),
'c*', label = "\\alpha=100")
plt.ylim([0, 1])
plt.xlabel("network confidence score")
plt.ylabel("K_{\\alpha}")
plt.legend()
os.chdir(home_path)
plt.savefig("f_k_{}_{}.eps".format(key, r), format="eps")
plt.savefig("f_k_{}_{}.jpg".format(key, r), format="jpg")
plt.close()
plt.title("summary for all keys")
for key in keys:
try:
c = legend_color[key]
n = legend_name[key]
except:
c = "yellow"
n = "unk"
plt.plot(lim, scores_dict[key]["f1"], label=n, color=c, marker='x')
plt.xlabel("network confidence score")
plt.ylabel("f1 score")
plt.ylim([0,1])
plt.legend()
os.chdir(home_path)
plt.savefig("f_f1_{}.eps".format(r), format="eps")
plt.savefig("f_f1_{}.jpg".format(r), format="jpg")
plt.close()
with open("all_scores" + ".json", "w") as f:
json.dump(all_scores, f)
# dont to it the last time (saves time, and there is no use to train once more)
if r < RETRAIN:
onlySave = False
else:
onlySave = True
model, dataset, NAME_r, CHECKPOINT = retrain(model,
dataset,
NAME_r,
CHECKPOINT,
EPOCHS,
end=end,
onlySave=onlySave,
console=console,
nb_used=nb_used)
return all_scores
if False:
print("{} Test on part CH dataset (test) {}".format("="*100, "="*100))
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores)
-if True:
+if False:
+ print("{} Test on CH mixed dataset {}".format("="*100, "="*100))
+ dataset.load_data(dir="CH", end="_fr")
+
+ all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr")
+
+if False:
print("{} Test on FR mixed dataset {}".format("="*100, "="*100))
dataset.load_data(dir="fr_mixed_full", end="_fr")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr")
if False:
print("{} Test on FI mixed dataset {}".format("="*100, "="*100))
dataset.load_data(dir="fi_mixed_full", end="_fi")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi")
if False:
print("{} Test on CH dataset {}".format("="*100, "="*100))
dataset.load_data(dir="ch_full")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores)
if False:
print("{} Test on FR dataset {}".format("="*100, "="*100))
dataset.load_data(dir="fr_full_surname", end="_fr")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr")
if False:
print("{} Test on FI dataset {}".format("="*100, "="*100))
dataset.load_data(dir="fi_full_surname", end="_fi")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi")
if False:
print("{} Test on FI dataset with Finnish names {}".format("="*100, "="*100))
dataset.load_data(dir="fi_full_surname_fi_names", end="_fi")
all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi2")
print("{}\nSummary\n{}".format("="*100, "="*100))
for item in all_scores:
print(item)
\ No newline at end of file