diff --git a/main_project/gpt2_colab/gen_ch.py b/main_project/gpt2_colab/gen_ch.py
index 69c22d1..36f31da 100644
--- a/main_project/gpt2_colab/gen_ch.py
+++ b/main_project/gpt2_colab/gen_ch.py
@@ -1,98 +1,104 @@
 import random
 from gen_master import *


 class GenDataCH(GenData):
     def __init__(self, home_path):
         super().__init__(home_path)
         self.abrev = "ch"
         self.keys = ["surname", "givenName", "dateOfBirth", "height", "placeOfOrigin",
                      "dateOfIssue", "dateOfExpiry", "sex", "identityCard", "nationality"]
         self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "placeOfOrigin",
                          "dateOfIssue", "dateOfExpiry", "sex", "identityCard", "nationality", "code"]
         self.schema = {
             "surname": "NAME - NOM - COGNOME - NUM - SURNAME",
             "givenName": "VORNAME(N) - PRENOMS(S) - NOME(I) - PRENUM(S) - GIVEN NAME(S)",
             "dateOfBirth": "GEBURTSDATUM - DATE DE NAISSANCE - DATA DI NASCITA - DATA DA NASCHIENTSCHA - DATE OF BIRTH",
             "height": "GRÖSSE - TAILLE - STATURA - GRONDEZZA - HEIGHT",
             "placeOfOrigin": "HEIMATORT - LIEU D'ORIGINE - LUOGO DI ATTINENZA - LIEU D'ORIGIN - PLACE OF ORIGIN",
             "authority": "BEHÖRDE - AUTORITÉ - AUTORITÀ - AUTORIDAD - AUTHORITY",
             "dateOfIssue": "AUSGESTELLT AM - DÉLIVERÉE LE - RILASCIATA IL - EMESSA ILS - DATE OF ISSUE",
             "dateOfExpiry": "GÜLTIG BIS - DATE D'EXPIRATION - DATA DI SCADENZA - DATA DA SCADENZA - DATE OF EXPIRY",
             "nationality": "NATIONALITÄT - NATIONALITÉ - CITTADINAZA - NAZIUNALITAD - NATIONALITY",
             "identityCard": "IDENTITÄTSKARTE - CARTE D'IDENTITÉ - CARTA DiDENTITÀ - CARTA D'IDENTITAD - IDENTITY CARD",
             "sex": "GESCHLECHT - SEE - SESSO - SCHLATTAINA - SEX",
             "code": "CODE"
         }

     # protected functions
     def _get_firstName(self):
         with open('data.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         # only keep the first name; sometimes there are entries like "Bob and Anna"
         first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
         first_names = [x for x in first_names if x is not None]
         first_names = [x.split(' ')[0] for x in first_names]
         return first_names

     def _get_lastName(self):
         with open('data.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
         last_names = [x for x in last_names if x is not None]
         return last_names

     def _get_city(self):
         with open('data.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         place_of_origin = [x for x in data["location"] if (x is not None)]
         return place_of_origin

     def _gen_date_(self, d, m, y):
         return self._strnum(d) + " " + self._strnum(m) + " " + str(y)[2:4]

     def _gen_identityCard(self):
         abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
         nb = abc[random.randint(0, len(abc) - 1)]
         nb += self._strnum(random.randint(100, 9999999), min_len=7)
         return nb

-    def _get_answer(self, key):
+    def _get_answer(self, key, set=None):
         # not yet self-consistent: age-height; dateOfIssue-dateOfExpiry
         if key == "surname":
-            return self.fake_data["last name"]
+            if set is None:
+                return self.fake_data["last name"]
+            return self.data_sets[set]["last name"]
         elif key == "givenName":
-            return self.fake_data["first name"]
+            if set is None:
+                return self.fake_data["first name"]
+            return self.data_sets[set]["first name"]
         elif key == "dateOfBirth":
             return self._gen_date(1940, 2020)
         elif key == "height":
             return [random.randint(100, 180)]
         elif key == "placeOfOrigin":
-            return self.fake_data["city"]
+            if set is None:
+                return self.fake_data["city"]
+            return self.data_sets[set]["city"]
         elif key == "dateOfIssue":
             return self._gen_date(2010, 2020)
         elif key == "dateOfExpiry":
             return self._gen_date(2020, 2030)
         elif key == "sex":
             return ["M", "F"]
         elif key == "identityCard":
             return [self._gen_identityCard()]
         elif key == "nationality":
             return ["Schweiz - Suisse - Svizzera - Svizra - Switzerland"]
         else:
             raise NotImplementedError("key not known")
diff --git a/main_project/gpt2_colab/gen_de.py b/main_project/gpt2_colab/gen_de.py
index 1545760..b3812e9 100644
--- a/main_project/gpt2_colab/gen_de.py
+++ b/main_project/gpt2_colab/gen_de.py
@@ -1,97 +1,101 @@
 import random
 from gen_master import *


 class GenDataDE(GenData):
     def __init__(self, home_path):
         super().__init__(home_path)
         self.abrev = "de"
         self.keys = ["surname", "givenName", "dateOfBirth", "height", "dateOfIssue", "dateOfExpiry",
                      "identityCard", "eyeColor", "placeOfBirth", "nationality"]
         self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "authority", "dateOfIssue",
                          "dateOfExpiry", "identityCard", "signature", "eyeColor", "placeOfBirth",
                          "ArtistName", "address", "nationality"]
         self.schema = {
             "surname": "Name - Surname - Nom",
             "givenName": "Vornamen - Given names - Prénoms",
             "dateOfBirth": "Geburtstag - Date of birth - Date de naissance",
             "height": "Größe - Height - Taille",
             "authority": "Behörde - Authority - Autorité",
             "dateOfIssue": "Datum - Date - Date",
             "dateOfExpiry": "Gültig bis - Date of expiry - Date d'expiration",
             "identityCard": "PERSONALAUSWEIS - IDENTITY CARD - CARTE D'IDENTITE",
             "signature": "Unterschrit der Inhaberin/des Inhabers - Signature of bearer - Signature de la titulaire/du titulaire",
             "eyeColor": "Aufgenfarbe - Colour of eyes - Couleur des yeux",
             "placeOfBirth": "Geburtsort - Place of birth - Lieu de naissance",
             "ArtistName": "Ordens- oder Künstlername/Religious name or pseudonym/ Nom de religion ou pseudonyme",
             "address": "Anschrift - Adress - Adresse",
             "nationality": "Staatsangehörigkeit - Nationality - Nationalité",
         }

     # protected functions
     def _get_firstName(self):
         with open('data_germany.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         # only keep the first name; sometimes there are entries like "Bob and Anna"
         first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
         first_names = [x for x in first_names if x is not None]
         first_names = [x.split(' ')[0] for x in first_names]
         return first_names

     def _get_lastName(self):
         with open('data_germany.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
         last_names = [x for x in last_names if x is not None]
         return last_names

     def _get_city(self):
         with open('data_geneva_city.json', 'r') as fp:
             data = json.load(fp)
         return data['region_de']

     def _gen_date_(self, d, m, y):
         return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)

     def _gen_identityCard(self):
         # digits are repeated so that numbers are drawn more often than letters
         abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789012345678901234567890123456789"
         nb = ""
         while len(nb) < 9:
             nb += abc[random.randint(0, len(abc) - 1)]
         return nb

-    def _get_answer(self, key):
-        ["surname", "givenName", "dateOfBirth", "height", "dateOfIssue", "dateOfExpiry",
-         "identityCard", "eyeColor", "placeOfBirth", "nationality"]
+    def _get_answer(self, key, set=None):
         # not yet self-consistent: age-height; dateOfIssue-dateOfExpiry
         if key == "surname":
-            return self.fake_data["last name"]
+            if set is None:
+                return self.fake_data["last name"]
+            return self.data_sets[set]["last name"]
         elif key == "givenName":
-            return self.fake_data["first name"]
+            if set is None:
+                return self.fake_data["first name"]
+            return self.data_sets[set]["first name"]
         elif key == "dateOfBirth":
             return self._gen_date(1940, 2020)
         elif key == "height":
             return [random.randint(100, 180)]
         elif key == "placeOfBirth":
-            return self.fake_data["city"]
+            if set is None:
+                return self.fake_data["city"]
+            return self.data_sets[set]["city"]
         elif key == "dateOfIssue":
             return self._gen_date(2010, 2020)
         elif key == "dateOfExpiry":
             return self._gen_date(2020, 2030)
         elif key == "identityCard":
             return [self._gen_identityCard()]
         elif key == "nationality":
             return ["DEUTSCH"]
         elif key == "eyeColor":
             return ["bernstein", "grün", "grünbraun", "grau", "blau", "hellbraun", "hellblau", "blaugrün"]
         else:
             raise NotImplementedError("key not known")
diff --git a/main_project/gpt2_colab/gen_fake_data.py b/main_project/gpt2_colab/gen_fake_data.py
index 084cd3c..c78bd05 100644
--- a/main_project/gpt2_colab/gen_fake_data.py
+++ b/main_project/gpt2_colab/gen_fake_data.py
@@ -1,32 +1,36 @@
 from gen_master import *
 from gen_ch import *
 from gen_fi import *
 from gen_fr import *
 from gen_de import *
 from gen_it import *
 import os

 homepath = os.getcwd()

 CH = GenDataCH(homepath)
 FI = GenDataFI(homepath)
 FR = GenDataFR(homepath)
 DE = GenDataDE(homepath)
 IT = GenDataIT(homepath)

 countries = [CH, FI, FR, DE, IT]

 for c in countries:
     c.load_real_data()

 for c in countries:
-    c.run_fakeGen(250, countries)
+    c.run_fakeGen(2500, countries)
+
+for c in countries:
+    c.load_real_data()

 print("CH_part_"*10)
-CH.fill_schema(nb_keys=3)
+# CH.fill_schema(nb_keys=3)

-for c in countries:
+
+for c in [DE, IT]:
     print()
     print("{}_full_".format(c._get_country()) * 10)
-    c.fill_schema(nb_keys="all")
\ No newline at end of file
+    c.fill_schema(nb_keys="all", dir=c._get_country().upper())
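Note: the driver now requests 2500 fake entries per label and country, reloads the real data so the freshly written per-split files get picked up, and builds full-schema datasets only for DE and IT. Before committing to the full run, a smoke test along these lines might be useful (hypothetical: single country, tiny budget, assuming the same JSON files and directory layout as the repo):

    # Hypothetical smoke test: exercise one country end to end with a tiny
    # budget before committing to the full 2500-entry run.
    import os
    from gen_de import GenDataDE

    homepath = os.getcwd()
    DE = GenDataDE(homepath)
    DE.load_real_data()
    DE.run_fakeGen(10, [DE])                    # tiny budget, single-country cleaning
    DE.fill_schema(nb_keys="all", dir="DE_SMOKE")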
diff --git a/main_project/gpt2_colab/gen_fi.py b/main_project/gpt2_colab/gen_fi.py
index 97671dd..57b98ef 100644
--- a/main_project/gpt2_colab/gen_fi.py
+++ b/main_project/gpt2_colab/gen_fi.py
@@ -1,111 +1,115 @@
 import random
 from gen_master import *


 class GenDataFI(GenData):
     def __init__(self, home_path):
         super().__init__(home_path)
         self.abrev = "fi"
         self.keys = ["surname", "givenName", "dateOfBirth", "dateOfIssue", "dateOfExpiry", "sex",
                      "identityCard", "nationality"]
         self.keys_all = ["surname", "givenName", "dateOfBirth", "dateOfIssue", "dateOfExpiry", "sex",
                          "identityCard", "nationality", "one", "two"]
         self.schema = {
             "surname": "SUKUNIMI - EFTERNAMN",
             "givenName": "ETUNIMET - FÖRNAMN",
             "dateOfBirth": "SYNTYMÄAIKA - FÖDELSEDATUM",
             "dateOfIssue": "MYÖNNETTY - UTFÄRDAT",
             "dateOfExpiry": "VOIMASSA - GILTIGT T.O.M.",
             "nationality": "KANSALAISUUS - NATIONALITET",
             "identityCard": "KORTTINUMERO - KORTNUMMER",
             "sex": "SUKUPUOLI - KÖN",
             "one": "TUNNUS - KOD",
             "two": "CAN"
         }

     # protected functions
     def _get_firstName(self):
         with open('data_fi.json', 'r') as fp:
             data = json.load(fp)
         first = []
         last = []
         for n in data['names']:
             tmp = n.split(" ")
             # if there is more than one space, it's most likely not a name
             if len(tmp) != 2:
                 pass
             else:
                 if len(tmp[0]) > 3:
                     first.append(tmp[0])
                 if len(tmp[1]) > 3:
                     last.append(tmp[1])
         return first

     def _get_lastName(self):
         with open('data_fi.json', 'r') as fp:
             data = json.load(fp)
         first = []
         last = []
         for n in data['names']:
             tmp = n.split(" ")
             # if there is more than one space, it's most likely not a name
             if len(tmp) != 2:
                 pass
             else:
                 if len(tmp[0]) > 3:
                     first.append(tmp[0])
                 if len(tmp[1]) > 3:
                     last.append(tmp[1])
         return last

     def _get_city(self):
         with open('data_fi.json', 'r') as fp:
             data = json.load(fp)
         locations_fi = data['addresses']
         city_fi = []
         for x in locations_fi:
             tmp = x.split(" ")
             tmp = tmp[-1].lower()
             tmp = tmp[0].upper() + tmp[1:]
             city_fi.append(tmp)
         with open('data_geneva_city.json', 'r') as fp:
             data = json.load(fp)
         city_fi += data['region_fi']
         return city_fi

     def _gen_date_(self, d, m, y):
         return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)

     def _gen_identityCard(self):
         return random.randint(100000000, 999999999)

-    def _get_answer(self, key):
+    def _get_answer(self, key, set=None):
         # not yet self-consistent: age-height; dateOfIssue-dateOfExpiry
         if key == "surname":
-            return self.fake_data["last name"]
+            if set is None:
+                return self.fake_data["last name"]
+            return self.data_sets[set]["last name"]
         elif key == "givenName":
-            return self.fake_data["first name"]
+            if set is None:
+                return self.fake_data["first name"]
+            return self.data_sets[set]["first name"]
         elif key == "dateOfBirth":
             return self._gen_date(1940, 2020)
         elif key == "dateOfIssue":
             return self._gen_date(2010, 2020)
         elif key == "dateOfExpiry":
             return self._gen_date(2020, 2030)
         elif key == "sex":
             return ["M", "F"]
         elif key == "identityCard":
             return [self._gen_identityCard()]
         elif key == "nationality":
             return ["FIN"]
         else:
             raise NotImplementedError("key not known")
\ No newline at end of file
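Note: `_get_city` derives Finnish city names by taking the last whitespace-separated token of each crawled address and re-capitalising it. A toy illustration of that normalisation (the addresses below are invented):

    # Toy illustration of the address-to-city normalisation used in _get_city.
    addresses = ["Mannerheimintie 1 HELSINKI", "Kauppakatu 8 tampere"]

    cities = []
    for addr in addresses:
        token = addr.split(" ")[-1].lower()   # last token, lower-cased
        token = token[0].upper() + token[1:]  # re-capitalise the first letter
        cities.append(token)

    print(cities)  # ['Helsinki', 'Tampere']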
diff --git a/main_project/gpt2_colab/gen_fr.py b/main_project/gpt2_colab/gen_fr.py
index a6c7c05..8c51d56 100644
--- a/main_project/gpt2_colab/gen_fr.py
+++ b/main_project/gpt2_colab/gen_fr.py
@@ -1,91 +1,97 @@
 import random
 from gen_master import *


 class GenDataFR(GenData):
     def __init__(self, home_path):
         super().__init__(home_path)
         self.abrev = "fr"
         self.keys = ["surname", "givenName", "dateOfBirth", "height", "placeOfBirth", "dateOfIssue",
                      "dateOfExpiry", "sex", "identityCard", "nationality"]
         self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "placeOfBirth", "dateOfIssue",
                          "dateOfExpiry", "sex", "identityCard", "nationality"]
         self.schema = {
             "surname": "Nom",
             "givenName": "Prénom(s)",
             "dateOfBirth": "Né(e) le",
             "height": "Taille",
             "placeOfBirth": "à",
             "authority": "par",
             "dateOfIssue": "délivrée le",
             "dateOfExpiry": "Carte valable jusqu'au",
             "nationality": "Nationalité",
             "identityCard": "CARTE NATIONALE D'IDENTITÉ N°",
             "sex": "Sexe",
             "signature": "Signature du titulaire",
             "signature2": "Signature de l'autorité",
             "address": "Adresse"
         }

     # protected functions
     def _get_firstName(self):
         with open('data_geneva.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         # only keep the first name; sometimes there are entries like "Bob and Anna"
         first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
         first_names = [x for x in first_names if x is not None]
         first_names = [x.split(' ')[0] for x in first_names]
         return first_names

     def _get_lastName(self):
         with open('data_geneva.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
         last_names = [x for x in last_names if x is not None]
         return last_names

     def _get_city(self):
         with open('data_geneva_city.json', 'r') as fp:
             data = json.load(fp)
         return data['region']

     def _gen_date_(self, d, m, y):
         return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)

     def _gen_identityCard(self):
         return random.randint(100000000000, 999999999999)

-    def _get_answer(self, key):
+    def _get_answer(self, key, set=None):
         # not yet self-consistent: age-height; dateOfIssue-dateOfExpiry
         if key == "surname":
-            return self.fake_data["last name"]
+            if set is None:
+                return self.fake_data["last name"]
+            return self.data_sets[set]["last name"]
         elif key == "givenName":
-            return self.fake_data["first name"]
+            if set is None:
+                return self.fake_data["first name"]
+            return self.data_sets[set]["first name"]
         elif key == "dateOfBirth":
             return self._gen_date(1940, 2020)
         elif key == "height":
             return [random.randint(100, 180)]
         elif key == "placeOfBirth":
-            return self.fake_data["city"]
+            if set is None:
+                return self.fake_data["city"]
+            return self.data_sets[set]["city"]
         elif key == "dateOfIssue":
             return self._gen_date(2010, 2020)
         elif key == "dateOfExpiry":
             return self._gen_date(2020, 2030)
         elif key == "sex":
             return ["M", "F"]
         elif key == "identityCard":
             return [self._gen_identityCard()]
         elif key == "nationality":
             return ["Française"]
         else:
             raise NotImplementedError("key not known")
\ No newline at end of file
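Note: the shared `_gen_date` helper in gen_master.py, used by all the country classes above, enumerates every day/month/year combination, so calendar-impossible strings such as 31.02.2015 are produced as well. If only valid dates were wanted, a sketch of a variant filtered through `datetime` (assuming the same `_gen_date_` formatting hook) could look like this:

    # Sketch of a calendar-valid alternative to _gen_date; the class and the
    # _gen_date_ formatting hook are assumed to exist as in gen_master.py.
    from datetime import date

    def _gen_date_valid(self, from_y, to_y):
        dates = []
        for d in range(1, 32):
            for m in range(1, 13):
                for y in range(from_y + 1, to_y + 1):
                    try:
                        date(y, m, d)        # raises ValueError for 31.02. etc.
                    except ValueError:
                        continue
                    dates.append(self._gen_date_(d, m, y))
        return dates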
diff --git a/main_project/gpt2_colab/gen_it.py b/main_project/gpt2_colab/gen_it.py
index 523ef77..113efe9 100644
--- a/main_project/gpt2_colab/gen_it.py
+++ b/main_project/gpt2_colab/gen_it.py
@@ -1,140 +1,146 @@
 import random
 from gen_master import *
 import copy


 class GenDataIT(GenData):
     def __init__(self, home_path):
         super().__init__(home_path)
         self.abrev = "it"
         self.keys = ["surname", "givenName", "dateOfBirth", "height", "sex", "dateOfIssue",
                      "dateOfExpiry", "nationality", "identityCard", "placeOfBirth"]
         self.keys_all = ["surname", "givenName", "dateOfBirth", "height", "sex", "authority",
                          "dateOfIssue", "dateOfExpiry", "nationality", "identityCard", "signature",
                          "parents", "fiscalCode", "birthCode", "address"]
         self.schema = {
             "surname": "COGNOME - SURNAME",
             "givenName": "NOME - NAME",
             "dateOfBirth": "LUOGO E DATA DI NASCITA - PLACE AND DATE OF BIRTH",
             "height": "STATURA - HEIGHT",
             "sex": "SESSO - SEX",
             "authority": "COMUNE DI - MUNICIPALITY",
             "dateOfIssue": "EMISSIONE - ISSUING",
             "dateOfExpiry": "SCADENZA - EXPIRY",
             "nationality": "CITTADINANZA - NATIONALITY",
             "identityCard": "CARTA DI IDENTITA - IDENTITY CARD",
             "signature": "FIRMA DEL TITOLARE - HOLDER'S SIGNATURE",
             "parents": "COGNOME E NOME DE GENITORI O DI CHI NE FA LE VECI - SURNAME AND NAME OF PARENTS OR LEGAL GUARDIAN",
             "fiscalCode": "CODICE FISCALE - FISCAL CODE",
             "birthCode": "ESTREMAKATTO DI NASCITA",
             "address": "INDIRIZZO DI RESIDENZA - RESIDENCE",
         }

     # protected functions
     def _get_firstName(self):
         with open('data_italy.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         # only keep the first name; sometimes there are entries like "Bob and Anna"
         first_names = [x.split(', ', 2)[1] for x in data["name"] if (", " in x)]
         first_names = [x for x in first_names if x is not None]
         first_names = [x.split(' ')[0] for x in first_names]
         return first_names

     def _get_lastName(self):
         with open('data_italy.json', 'r') as fp:
             data = json.load(fp)
         random.seed(42)
         last_names = [x.split(', ', 2)[0] for x in data["name"] if (", " in x)]
         last_names = [x for x in last_names if x is not None]
         return last_names

     def _get_city(self):
         with open('data_geneva_city.json', 'r') as fp:
             data = json.load(fp)
         return data['region_it']

     def _gen_date_(self, d, m, y):
         return self._strnum(d) + "." + self._strnum(m) + "." + self._strnum(y)

     def _gen_identityCard(self):
         # digits are repeated so that numbers are drawn more often than letters
         abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890123456789012345678901234567890123456789"
         nb = ""
         while len(nb) < 9:
             nb += abc[random.randint(0, len(abc) - 1)]
         return nb

-    def _get_answer(self, key):
+    def _get_answer(self, key, set=None):
         # not yet self-consistent: age-height; dateOfIssue-dateOfExpiry
         if key == "surname":
-            return self.fake_data["last name"]
+            if set is None:
+                return self.fake_data["last name"]
+            return self.data_sets[set]["last name"]
         elif key == "givenName":
-            return self.fake_data["first name"]
+            if set is None:
+                return self.fake_data["first name"]
+            return self.data_sets[set]["first name"]
         elif key == "dateOfBirth":
-            return self.fake_data["city"], self._gen_date(1940, 2020)
+            if set is None:
+                return self.fake_data["city"], self._gen_date(1940, 2020)
+            return self.data_sets[set]["city"], self._gen_date(1940, 2020)
         elif key == "height":
             return [random.randint(100, 180)]
         elif key == "dateOfIssue":
             return self._gen_date(2010, 2020)
         elif key == "dateOfExpiry":
             return self._gen_date(2020, 2030)
         elif key == "sex":
             return ["M", "F"]
         elif key == "identityCard":
             return [self._gen_identityCard()]
         elif key == "nationality":
             return ["ITA"]
         else:
             raise NotImplementedError("key not known")

-    def _fill_schema(self, question, nb_keys="all"):
+    def _fill_schema(self, question, nb_keys="all", set_=None):
         # special case for Italy: place of birth and date of birth share a combined label
         tmp = copy.deepcopy(self.keys)
         tmp.remove("placeOfBirth")
         if nb_keys != "all":
             random.shuffle(tmp)
             tmp = tmp[:nb_keys]
         if question in tmp:
             pass
         elif question == "placeOfBirth" and "dateOfBirth" in tmp:
             pass
         elif question == "placeOfBirth":
             tmp[random.randint(0, len(tmp) - 1)] = "dateOfBirth"
         else:
             # replace a random entry
             tmp[random.randint(0, len(tmp) - 1)] = question
         context = ""
         answer = ""
         for key in tmp:
             if key != "dateOfBirth":
-                ans = self._get_answer(key)
+                ans = self._get_answer(key, set=set_)
                 ans = ans[random.randint(0, len(ans)-1)]
                 if key == question:
                     answer = ans
             else:
-                ans1, ans2 = self._get_answer(key)
-                ans1 = ans1[random.randint(0, len(ans1))]
-                ans2 = ans2[random.randint(0, len(ans2))]
+                ans1, ans2 = self._get_answer(key, set=set_)
+                ans1 = ans1[random.randint(0, len(ans1) - 1)]
+                ans2 = ans2[random.randint(0, len(ans2) - 1)]
                 ans = str(ans1) + " " + str(ans2)
                 if question == "dateOfBirth":
                     answer = ans2
                 elif question == "placeOfBirth":
                     answer = ans1
             context += self.schema[key] + ": " + str(ans) + "; "
         return context, question, answer
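Note: Italy is the one country where place and date of birth share a single label, which is why `GenDataIT` overrides `_fill_schema`. A toy illustration of how one context line carries two possible answers (the values below are invented):

    # Toy illustration of the combined Italian birth field handled by
    # GenDataIT._fill_schema; the values here are made up.
    schema_label = "LUOGO E DATA DI NASCITA - PLACE AND DATE OF BIRTH"
    city, born = "Roma", "03.07.1985"

    context = schema_label + ": " + city + " " + born + "; "
    answers = {"placeOfBirth": city, "dateOfBirth": born}

    print(context)                  # one context line carries both answers
    print(answers["placeOfBirth"])  # Roma
    print(answers["dateOfBirth"])   # 03.07.1985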
"city": []} self.model = None self.tokenizer = None self.keys = None self.keys_all = None self.schema = None self.abrev = None + self.tt = ["train", "test1", "test2", "test3", "test4"] + + self.data_sets = {} + for t in self.tt: + self.data_sets[t] = {"first name": [], "last name": [], "city": []} + + self.tt_s = [0, 1500, 250, 250, 250, 250] + + # starting index + for i, _ in enumerate(self.tt_s[:-1]): + self.tt_s[i+1] += self.tt_s[i] + + # starting index as fraction of total length + for i, _ in enumerate(self.tt_s): + self.tt_s[i] = self.tt_s[i]/self.tt_s[-1] + # Private methods # protected functions def _strnum(self, i, min_len=2): i = str(i) while (len(i) < min_len): i = "0" + i return i @abstractmethod def _get_firstName(self): raise NotImplementedError("abstract method") @abstractmethod def _get_lastName(self): raise NotImplementedError("abstract method") @abstractmethod def _get_city(self): raise NotImplementedError("abstract method") @abstractmethod def _get_answer(self, key): raise NotImplementedError("abstract method") - def _fill_schema(self, question, nb_keys="all"): + def _fill_schema(self, question, nb_keys="all", set_=None): tmp = copy.deepcopy(self.keys) if nb_keys != "all": random.shuffle(tmp) tmp = tmp[:nb_keys] if question in tmp: pass else: # replace a random entry tmp[random.randint(0, len(tmp) - 1)] = question context = "" answer = "" for key in tmp: - ans = self._get_answer(key) + ans = self._get_answer(key, set=set_) ans = ans[random.randint(0, len(ans)-1)] context += self.schema[key] + ": " + str(ans) + "; " if key == question: answer = ans - return context, question, answer + return context, question, str(answer) def __doubleCapital(self, word): for i, l in enumerate(word[:-1]): if l.isupper() and word[i+1].isupper() and l.lower() in "abcdefghijklmnopqrstuvwxyz" \ and word[i+1].lower() in "abcdefghijklmnopqrstuvwxyz": return True + if i>0 and l.isupper() and word[i-1] not in [" ", ".", "'"]: + return True return False - def _clean_name(self, dirty, countries=[]): # collect all city names city_names = [] for c in countries: data, _ = c._get_data() city_names += data["city"] clean = [] for d in dirty: # Name contains spaces and is not "van ..." if " " in d and "van" not in d: pass # first letter is lowercase or "." is in word, or it's just two letters elif d[0].upper() != d[0] or "." 
in d or len(d) <= 2 or "(" in d or ")" in d: pass + elif d[0].lower() not in "abcdefghijklmnopqrstuvwxyz": + pass # contains numbers elif any(char.isdigit() for char in d): pass # everything is in upper case elif all(char.isupper() for char in d): pass # specific for finnish crawl elif "A-Z" in d or "oy" == d.lower() or "studio" in d.lower(): pass elif ">" in d or "<" in d or "/" in d: pass elif self.__doubleCapital(d): pass elif d in city_names: pass else: clean.append(d) return clean - def _clean_city(self, dirty, countries=[]): + def _clean_city(self, dirty, countries=[], countries2=[]): # collect all city names other_city_names = [] for c in countries: data, _ = c._get_data() other_city_names += data["city"] + names = [] + for c in countries: + data, _ = c._get_data() + names += data["first name"] + names += data["last name"] + + for c in countries2: + data, _ = c._get_data() + names += data["first name"] + names += data["last name"] + clean = [] for d in dirty: # first letter is lowercase, or it's just two letters if d[0].upper() != d[0] or len(d) <= 2 or "(" in d or ")" in d: pass + # first letter is lowercase or it's just two letters + elif d[0].upper() != d[0] or len(d) <= 2: + pass + elif d[0].lower() not in "abcdefghijklmnopqrstuvwxyz": + pass # contains numbers elif any(char.isdigit() for char in d): pass # everything is in upper case elif all(char.isupper() for char in d): pass elif ">" in d or "<" in d or "/" in d: pass elif self.__doubleCapital(d): pass elif d in other_city_names: pass + # dont accept persons names + elif d in names: + pass else: clean.append(d) return clean @abstractmethod def _gen_date_(self, d, m, y): raise NotImplementedError("abstract method") @abstractmethod def _gen_identityCard(self): raise NotImplementedError("abstract method") def _gen_date(self, from_y, to_y): # year should be given in the format 2012 (4 digits) dates = [] for d in range(1, 32): for m in range(1, 13): for y in range(from_y + 1, to_y + 1): dates.append(self._gen_date_(d, m, y)) return dates def _gpt_train_entry(self, ident, nat, list): keywords = ["", "", "", "<|endoftext|>"] out = [] for l in list: out.append(keywords[0] + str(list[:10]) + "<{}><{}>".format(ident, nat) + l + keywords[3]) return out def _gpt_save(self, list): long_str = "" for x in list: try: long_str += x + "\n" except: pass import io with io.open("gen_data_train.txt", 'w', encoding="utf-8") as f: # with open("gen_data_train.txt", 'w') as f: f.write(long_str) def _get_data(self): return self.data, self.fake_data def _set_data(self, data, fake_data): if data is not None: self.data = data if fake_data is not None: self.fake_data = fake_data def _get_country(self): return self.abrev def _fake_gen(self, ident, nat, list): keywords = ["", "", "", "<|endoftext|>"] - prompt = keywords[0] + str(list[:10]) + "<{}><{}>".format(ident, nat) # + l + keywords[3] + prompt = keywords[0] + str(list) + "<{}><{}>".format(ident, nat) # + l + keywords[3] input_ids = self.tokenizer.encode(prompt, return_tensors='tf') generated_text_samples = self.model.generate( input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, - no_repeat_ngram_size=0, - repetition_penalty=1.0, + no_repeat_ngram_size=3, + repetition_penalty=1.2, top_p=1.0, - temperature=1.0, + temperature=1.5, do_sample=True, top_k=0, early_stopping=True ) answer = self.tokenizer.decode(generated_text_samples[0]) answer = answer.replace(prompt, "") answer = answer.replace("<|endoftext|>", "") return answer # Public methods def rm_dub(self, dirty): # remove 
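Note on the `_fake_gen` sampling change: temperature rises to 1.5 and `repetition_penalty=1.2` / `no_repeat_ngram_size=3` are switched on, trading fidelity for more diverse generations. A minimal standalone sketch of the same Hugging Face `generate` call, run against the stock `gpt2` checkpoint rather than the project's fine-tuned `genFakeData5` model (the prompt format is imitated, not authoritative):

    # Sketch of the sampling setup used in _fake_gen, on the stock "gpt2"
    # checkpoint instead of the project's fine-tuned genFakeData5 model.
    from transformers import AutoTokenizer, TFGPT2LMHeadModel

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer.encode("['Anna', 'Bob']<first name><ch>", return_tensors="tf")
    samples = model.generate(
        input_ids,
        max_length=len(input_ids[0]) + 50,
        do_sample=True,            # sample instead of greedy decoding
        temperature=1.5,           # flatter distribution -> more diverse names
        repetition_penalty=1.2,    # discourage copying the prompt verbatim
        no_repeat_ngram_size=3,    # forbid repeating any 3-gram
        top_k=0,
        top_p=1.0,
    )
    print(tokenizer.decode(samples[0]))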
     # Public methods
     def rm_dub(self, dirty):
         # remove duplicates
         return list(set(dirty))

     def load_real_data(self):
         os.chdir(self.crawl_path)
         self.data["first name"] = self._clean_name(self.rm_dub(self._get_firstName()))
         self.data["last name"] = self._clean_name(self.rm_dub(self._get_lastName()))
         self.data["city"] = self._clean_city(self.rm_dub(self._get_city()))

+        try:
+            for t in self.tt:
+                filename = "fake_id_data_" + self.abrev + "_" + t + ".json"
+                print(filename)
+                with open(filename, 'r') as f:
+                    self.data_sets[t] = json.load(f)
+        except:
+            print("couldn't load test and train sets")
+
     def train_fakeGen(self, countries):
         # countries is a list of child classes
         train_data = []
         for ident in self.data.keys():
             for country in countries:
                 data, _ = country._get_data()
                 data = data[ident]  # select the list for this label before shuffling
                 random.shuffle(data)
                 if len(data) > 5000:
                     data = data[:5000]
                 else:
                     print("Warning, label <{}> in nationality <{}> only has {} entries".format(
                         ident, country._get_country(), len(data)))
                 # extend rather than append, so _gpt_save receives flat strings
                 train_data += self._gpt_train_entry(ident, country._get_country(), data)
         random.shuffle(train_data)
         self._gpt_save(train_data)

         os.chdir(self.finetune_path)
         with open("real_id_data" + ".json", "w") as f:
             json.dump(data, f)

         cmd = "python run_clm.py \
             --model_type {} \
             --train_file \"{}\" \
             --do_train \
             --per_gpu_train_batch_size 1 \
             --save_steps -1 \
             --num_train_epochs {} \
             --fp16 \
             --tokenizer_name gpt2 \
             --model_name_or_path gpt2 \
             --output_dir=\"{}\" \
             ".format(
             "gpt2",
             "gen_data_train" + ".txt",
             self.nbEpochs,
             self.model_path + "/" + self.outModelName)
         print(cmd)
         os.system(cmd)

     def run_fakeGen(self, nb, countries):
         os.chdir(self.home_path)
         self.model = TFGPT2LMHeadModel.from_pretrained(self.model_path + "/" + self.outModelName, from_pt=True)
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + self.outModelName)

         os.chdir(self.crawl_path)
         # try:
         f = 'fake_id_data_' + self.abrev + '.json'
         print(f)
         with open(f, 'r') as fp:
             fake_data = json.load(fp)

         # apply new cleaning rules
         for key in fake_data.keys():
             if key in ["first name", "last name"]:
                 fake_data[key] = self._clean_name(fake_data[key], countries)
             if key in ["city"]:
-                fake_data[key] = self._clean_city(fake_data[key], [c for c in countries if c._get_country() != self.abrev])
+                fake_data[key] = self._clean_city(fake_data[key],
+                                                  [c for c in countries if c._get_country() != self.abrev],
+                                                  [c for c in countries if c._get_country() == self.abrev])
         self.fake_data = fake_data
         print("{}:\n# first names: {}, # last names: {}, # cities: {}".format(
             self.abrev, len(fake_data["first name"]), len(fake_data["last name"]), len(fake_data["city"])))
         # except:
         #     print("couldn't find existing fake data entries")

         for ident in self.data.keys():
             print("{}, {}".format(ident, self.abrev))
             while len(self.fake_data[ident]) < nb:
                 data, fake_data = self._get_data()
-                tmp = self._fake_gen(ident, self.abrev, data[ident])
+                # tmp = self._fake_gen(ident, self.abrev, data[ident])
+                if len(self.fake_data[ident]) < 100:
+                    tmp = self._fake_gen(ident, self.abrev, data[ident][:10])
+                else:
+                    sample_fake = copy.deepcopy(self.fake_data[ident])
+                    random.shuffle(sample_fake)
+
+                    tmp = self._fake_gen(ident, self.abrev, sample_fake[:10] + data[ident][:10])

                 # clean names
                 try:
                     if ident in ["first name", "last name"]:
                         tmp = self._clean_name([tmp], countries)[0]
                     if ident in ["city"]:
-                        tmp = self._clean_city([tmp], [c for c in countries if c._get_country() != self.abrev])[0]
+                        tmp = self._clean_city([tmp],
+                                               [c for c in countries if c._get_country() != self.abrev],
+                                               [c for c in countries if c._get_country() == self.abrev])[0]
                     fake_data[ident].append(tmp)
                 except:
                     # if the clean function returns an empty list
                     pass
                 fake_data[ident] = self.rm_dub(fake_data[ident])
                 self.fake_data = fake_data
                 new = [tmp for tmp in fake_data[ident] if tmp not in data[ident]]
                 print("There have been {} new generations, and {} identical to the test set".format(
                     len(new), len(fake_data[ident]) - len(new)))

             data, fake_data = self._get_data()
             new = [tmp for tmp in fake_data[ident] if tmp not in data[ident]]
             print(new)
             print("There have been {} new generations, and {} identical to the test set".format(
                 len(new), len(fake_data[ident]) - len(new)))
             print("{}, {} -- ended search".format(ident, self.abrev))

         os.chdir(self.crawl_path)
         with open("fake_id_data_" + self.abrev + ".json", "w") as f:
             json.dump(fake_data, f)

-    def fill_schema(self, keys="all", nb_keys="all"):
+        # split into train and test sets
+
+        os.chdir(self.crawl_path)
+
+        shuffled_data = copy.deepcopy(fake_data)
+        for key in shuffled_data.keys():
+            random.shuffle(shuffled_data[key])
+
+        for i, t in enumerate(self.tt):
+            fake_data_subset = {"first name": [], "last name": [], "city": []}
+            for key in fake_data_subset.keys():
+                l = len(shuffled_data[key])
+                subset = shuffled_data[key][int(self.tt_s[i]*l):int(self.tt_s[i+1]*l)]
+                fake_data_subset[key] = subset
+
+            with open("fake_id_data_" + self.abrev + "_" + t + ".json", "w") as f:
+                json.dump(fake_data_subset, f)
+
+    def fill_schema(self, keys="all", nb_keys="all", dir="myDataset"):
         if keys == "all":
             keys = self.keys
-        for k in keys:
-            context, question, answer = self._fill_schema(k, nb_keys=nb_keys)
-            print(context)
-            print(question)
-            print(answer)
\ No newline at end of file
+        for t in self.tt:
+            x = []
+            y = []
+
+            qa_set = {"data": {"question": [], "context": [], "answers": []}}
+            # generate 1000 examples for each question
+            for _ in range(1000):
+                for k in keys:
+                    context, question, answer = self._fill_schema(k, nb_keys=nb_keys, set_=t)
+                    # print(context)
+                    # print(question)
+                    # print(answer)
+
+                    keywords = ["", "", "", "<|endoftext|>"]
+                    x.append(keywords[0] + context + keywords[2] + question + keywords[1])
+                    y.append(answer + keywords[3])
+
+                    qa_set["data"]["question"].append(question)
+                    qa_set["data"]["context"].append(context)
+                    qa_set["data"]["answers"].append({"answer_start": [context.find(answer)], "text": [answer]})
+
+            os.chdir(self.data_path)
+            if not os.path.exists(dir):
+                os.mkdir(dir)
+            os.chdir(dir)
+            with open("x_" + t + ".json", "w") as f:
+                json.dump(x, f)
+            with open("y_" + t + ".json", "w") as f:
+                json.dump(y, f)
+            with open("qa_" + t + ".json", "w") as f:
+                json.dump(qa_set, f)
\ No newline at end of file
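Note: the split bookkeeping added to `GenData.__init__` turns the per-split sizes `[0, 1500, 250, 250, 250, 250]` first into cumulative start indices and then into fractions of the total, which `run_fakeGen` uses to slice the shuffled pools. The arithmetic, worked through standalone:

    # Worked example of the tt_s bookkeeping from GenData.__init__.
    tt_s = [0, 1500, 250, 250, 250, 250]   # sizes: train=1500, test1..test4=250 each

    for i, _ in enumerate(tt_s[:-1]):      # cumulative start indices
        tt_s[i + 1] += tt_s[i]
    # -> [0, 1500, 1750, 2000, 2250, 2500]

    for i, _ in enumerate(tt_s):           # fractions of the total length
        tt_s[i] = tt_s[i] / tt_s[-1]
    # -> [0.0, 0.6, 0.7, 0.8, 0.9, 1.0]

    pool = list(range(2500))               # e.g. 2500 shuffled fake names
    train = pool[int(tt_s[0] * len(pool)):int(tt_s[1] * len(pool))]
    print(len(train))                      # 1500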
diff --git a/main_project/gpt2_colab/main.py b/main_project/gpt2_colab/main.py
index 2b0a4e8..2e01578 100644
--- a/main_project/gpt2_colab/main.py
+++ b/main_project/gpt2_colab/main.py
@@ -1,331 +1,338 @@
 # score per question category
 # entropy per question (if it's always the same, don't accept it? Problem with e.g. nationality)
 import os
 import load_files as lf
 import model as mo
 import json
 import matplotlib.pyplot as plt

 # choose the model type
 MODEL = "gpt"  # gpt, bert
 # train the model from scratch or from a checkpoint
 SCRATCH = False  # True, False
 # choose the checkpoint
-CHECKPOINT = "gpt2-large"  # xlm-roberta-base, gpt2, None, roberta-base, "gpt2-medium", "gpt2-large", "gpt2-xl"
+CHECKPOINT = "gpt2"  # xlm-roberta-base, gpt2, None, roberta-base, "gpt2-medium", "gpt2-large", "gpt2-xl"
 # train the model in this run
 TRAIN = True  # True, False
 # number of training epochs
 EPOCHS = 1
 # new model name
-NAME = "gpt_L_e_1_test"  # xmlr_e_10_test, gpt_e_1_test
+NAME = "gpt_e1_CHpart"  # xmlr_e_10_test, gpt_e_1_test
 # probaModes to be tested
 PROBA = ["longOk"]  # , "longOk", ["mult", "forceNon0", "maxNon0"], ["mult", "longOk"]
 # number of times the model should be retrained (0 is never)
 RETRAIN = 0

 home_path = os.getcwd()

 # load a dataset
 if MODEL == "gpt":
     print("Start instances for a gpt2 model")
     dataset = lf.DatasetGPT(home_path)
     model = mo.GPTModel(home_path, printStep=-1)
 elif MODEL == "bert":
     print("Start instances for a bert model")
     dataset = lf.DatasetBert(home_path)
     model = mo.BertModel(home_path, printStep=-1)
 else:
     raise NotImplementedError("Model type not defined")

 # load a dataset
 dataset.load_data()
+# dataset.load_data(dir="fi_mixed_full", end="_fi")

 # train a tokenizer from scratch
 if SCRATCH:
     raise NotImplementedError("Train a tokenizer")
     SCRATCH = "loc of tok"
 else:
     SCRATCH = None

 if TRAIN:
     model.train(nbEpochs=EPOCHS,
                 outModelName=NAME,
                 startCheckpoint=CHECKPOINT,
                 dataEnd="",
                 tokenizerLocaction=SCRATCH)

 # load the model
 model.load_model(NAME)

 all_scores = []


 def retrain(model, dataset, NAME, CHECKPOINT, EPOCHS, nb_used=100, end="", onlySave=True, console=""):
     xsure, ysure = model.getSureGuesses()
     xtest, ytest = dataset.get_test()
     if len(xsure) == 0:
         print("{}\n{}\n{}\n{}\n{}".format("="*50, "="*50, "there are no sure guesses...", "="*50, "="*50))
         return model, dataset, NAME, CHECKPOINT
     while len(xsure) < 100:
         xsure = xsure + xsure
         ysure = ysure + ysure
     dataset.save_data(xsure, ysure, NAME,
                       x_test=xtest[nb_used:],
                       y_test=ytest[nb_used:],
                       console=console,
                       )
     if not onlySave:
         CHECKPOINT = NAME
         NAME += "_adapt_" + end
         dataset.load_data(dir=CHECKPOINT, end="")
         if end != "":
             end = "_" + end
         model.train(nbEpochs=EPOCHS,
                     outModelName=NAME,
                     startCheckpoint=CHECKPOINT,
                     tokenizerLocaction=CHECKPOINT,
                     dataEnd="",
                     )
         # load the model
         model.load_model(NAME)
     return model, dataset, NAME, CHECKPOINT


 def K_alpha(tp, tn, fp, fn, alpha):
     K = []
     for i, _ in enumerate(tp):
         K.append(tp[i]/(tp[i]+fp[i]*alpha+tn[i]+fn[i]))
     return K


 def generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="", nb_used=1000):
     legend_color = {
         "all": "black",
         "sex": "goldenrod",
         "height": "red",
         "dateOfBirth": "plum",
         "dateOfExpiry": "fuchsia",
         "dateOfIssue": "deeppink",
         "placeOfBirth": "darkgreen",
         "surname": "lawngreen",
         "givenName": "mediumseagreen",
         "placeOfOrigin": "darkslategrey",
     }
     legend_name = {
         "all": "all",
         "sex": "sex",
         "height": "height",
         "dateOfBirth": "date of birth",
         "dateOfExpiry": "date of expiry",
         "dateOfIssue": "date of issue",
         "placeOfBirth": "place of birth",
         "surname": "last name",
         "givenName": "first name",
         "placeOfOrigin": "place of origin",
     }
     plt.rcParams.update({"text.usetex": True})
     for m in PROBA:
         model.load_model(NAME)
         model.set_proba_mode(m)
         NAME_r = NAME
         for r in range(RETRAIN + 1):
             scores_dict, scores, console = model.generate(dataset.get_test(), 0, nb_used)
             all_scores.append(scores)

             # print all scores
             keys = scores_dict.keys()
             buckets = len(scores_dict["all"]["count"])
             lim = [x/buckets for x in range(buckets)]
             for key in keys:
                 plt.title(key)
                 plt.plot(lim, scores_dict[key]["tp"], 'g+-', label="tp")
                 plt.plot(lim, scores_dict[key]["tn"], 'go--', label="tn")
                 plt.plot(lim, scores_dict[key]["fp"], 'r+-', label="fp")
                 plt.plot(lim, scores_dict[key]["fn"], 'ro--', label="fn")
                 tmp = [150, 500, 1000, 2000, 5000]
                 for upper in tmp:
                     if scores_dict[key]["tp"][0]+scores_dict[key]["fp"][0] < upper:
                         break
                 plt.ylim([0, upper])
                 plt.xlabel("network confidence score")
                 plt.ylabel("# examples")
                 plt.legend()
                 os.chdir(home_path)
                 plt.savefig("f_tp_{}_{}.eps".format(key, r), format="eps")
                 plt.savefig("f_tp_{}_{}.jpg".format(key, r), format="jpg")
                 plt.close()

                 # ---------------
                 plt.title(key)
                 plt.plot(lim, scores_dict[key]["f1"], 'go-', label="F1")
                 plt.plot(lim, scores_dict[key]["recall"], 'bx:', label="recall")
                 plt.plot(lim, scores_dict[key]["precision"], 'md--', label="precision")
                 plt.ylim([0, 1])
                 plt.xlabel("network confidence score")
                 plt.ylabel("score")
                 plt.legend()
                 os.chdir(home_path)
                 plt.savefig("f_sc_{}_{}.eps".format(key, r), format="eps")
                 plt.savefig("f_sc_{}_{}.jpg".format(key, r), format="jpg")
                 plt.close()

                 # ----------------
                 plt.title(key)
                 plt.plot(lim, scores_dict[key]["count"], "ko-")
                 tmp = [150, 500, 1000, 2000, 5000]
                 for upper in tmp:
                     if scores_dict[key]["count"][0] < upper:
                         break
                 plt.ylim([0, upper])
                 plt.xlabel("network confidence score")
                 plt.ylabel("# examples")
                 os.chdir(home_path)
                 plt.savefig("f_conf_{}_{}.eps".format(key, r), format="eps")
                 plt.savefig("f_conf_{}_{}.jpg".format(key, r), format="jpg")
                 plt.close()

                 # ----------------
                 plt.title(key)
                 plt.plot(lim, K_alpha(scores_dict[key]["tp"], scores_dict[key]["tn"],
                                       scores_dict[key]["fp"], scores_dict[key]["fn"], 1),
                          'co-', label="$\\alpha=1$")
                 plt.plot(lim, K_alpha(scores_dict[key]["tp"], scores_dict[key]["tn"],
                                       scores_dict[key]["fp"], scores_dict[key]["fn"], 2),
                          'cx:', label="$\\alpha=2$")
                 plt.plot(lim, K_alpha(scores_dict[key]["tp"], scores_dict[key]["tn"],
                                       scores_dict[key]["fp"], scores_dict[key]["fn"], 10),
                          'cd--', label="$\\alpha=10$")
                 plt.plot(lim, K_alpha(scores_dict[key]["tp"], scores_dict[key]["tn"],
                                       scores_dict[key]["fp"], scores_dict[key]["fn"], 100),
                          'c*', label="$\\alpha=100$")
                 plt.ylim([0, 1])
                 plt.xlabel("network confidence score")
                 plt.ylabel("$K_{\\alpha}$")
                 plt.legend()
                 os.chdir(home_path)
                 plt.savefig("f_k_{}_{}.eps".format(key, r), format="eps")
                 plt.savefig("f_k_{}_{}.jpg".format(key, r), format="jpg")
                 plt.close()

             plt.title("summary for all keys")
             for key in keys:
                 try:
                     c = legend_color[key]
                     n = legend_name[key]
                 except:
                     c = "yellow"
                     n = "unk"
                 plt.plot(lim, scores_dict[key]["f1"], label=n, color=c, marker='x')
             plt.xlabel("network confidence score")
             plt.ylabel("f1 score")
             plt.ylim([0, 1])
             plt.legend()
             os.chdir(home_path)
             plt.savefig("f_f1_{}.eps".format(r), format="eps")
             plt.savefig("f_f1_{}.jpg".format(r), format="jpg")
             plt.close()

             with open("all_scores" + ".json", "w") as f:
                 json.dump(all_scores, f)

             # don't do it the last time (saves time; there is no point training once more)
             if r < RETRAIN:
                 onlySave = False
             else:
                 onlySave = True
             model, dataset, NAME_r, CHECKPOINT = retrain(model, dataset, NAME_r, CHECKPOINT, EPOCHS,
                                                          end=end, onlySave=onlySave, console=console,
                                                          nb_used=nb_used)
     return all_scores
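Note: `K_alpha` computes K_alpha = tp / (tp + alpha*fp + tn + fn) per confidence bucket, so raising alpha increasingly punishes confident wrong answers. A quick numeric check:

    # Numeric check of the K_alpha metric used for the f_k_* plots.
    def K_alpha(tp, tn, fp, fn, alpha):
        return [tp[i] / (tp[i] + fp[i] * alpha + tn[i] + fn[i]) for i in range(len(tp))]

    tp, tn, fp, fn = [80], [5], [10], [5]
    print(K_alpha(tp, tn, fp, fn, 1))    # [0.8]     -> 80 / (80 + 10 + 5 + 5)
    print(K_alpha(tp, tn, fp, fn, 10))   # [0.42...] -> 80 / (80 + 100 + 5 + 5)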
"="*100)) + dataset.load_data(dir="CH", end="_fr") + + all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr") + +if False: print("{} Test on FR mixed dataset {}".format("="*100, "="*100)) dataset.load_data(dir="fr_mixed_full", end="_fr") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr") if False: print("{} Test on FI mixed dataset {}".format("="*100, "="*100)) dataset.load_data(dir="fi_mixed_full", end="_fi") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi") if False: print("{} Test on CH dataset {}".format("="*100, "="*100)) dataset.load_data(dir="ch_full") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores) if False: print("{} Test on FR dataset {}".format("="*100, "="*100)) dataset.load_data(dir="fr_full_surname", end="_fr") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fr") if False: print("{} Test on FI dataset {}".format("="*100, "="*100)) dataset.load_data(dir="fi_full_surname", end="_fi") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi") if False: print("{} Test on FI dataset with Finnish names {}".format("="*100, "="*100)) dataset.load_data(dir="fi_full_surname_fi_names", end="_fi") all_scores = generate(model, dataset, NAME, RETRAIN, CHECKPOINT, EPOCHS, all_scores, end="fi2") print("{}\nSummary\n{}".format("="*100, "="*100)) for item in all_scores: print(item) \ No newline at end of file