Commit a9bbcdd7 authored by Julian Rogawski
Browse files

machine learning added

parent ec76bb6d
......@@ -121,7 +121,6 @@ dependencies:
- sqlite=3.24.0=h84994c4_0
- statsmodels=0.9.0=py36h035aef0_0
- tensorboard=1.8.0=py36hf484d3e_0
- tensorflow=1.8.0=h57681fa_0
- tensorflow-base=1.8.0=py36h5f64886_0
- termcolor=1.1.0=py36_1
- thinc=6.11.2=py36hedc7406_1
......@@ -160,9 +159,13 @@ dependencies:
- pandocfilters==1.4.2
- prometheus-client==0.3.0
- qtconsole==4.3.1
- scikit-learn==0.19.2
- send2trash==1.5.0
- sklearn==0.0
- tensorflow==1.8.0
- terminado==0.8.1
- testpath==0.3.1
- widgetsnbextension==3.3.0
- xx-ent-wiki-sm==2.0.0
prefix: /home/julian/miniconda3/envs/binder_env
......@@ -14,6 +14,8 @@ class NameFinder:
:param text: The text in which this functions searches for names.
:return: list of found person names.
"""
if text == "":
return []
doc = self.nlp(text)
raw_persons = [entity.text for entity in doc.ents if entity.label_ == 'PER']
......
......@@ -2,8 +2,8 @@ from typing import List
import gender_ai
from components.website.conference_person import ConferencePerson
from components.website.conference_single_page import ConferenceSinglePage
from src.julian.components.website.conference_person import ConferencePerson
from src.julian.components.website.conference_single_page import ConferenceSinglePage
class Conference:
......
import json
from src.julian.analysis.utility import UtilityFunctions
from src.julian.handler.conference_data_handler import ConferenceDataHandler
from src.julian.handler.website_page_data_parser import ConferenceWebsitePageParser
import gender_ai
import gender_guesser.detector as gender
from analysis.utility import UtilityFunctions
from handler.conference_data_handler import ConferenceDataHandler
from handler.website_page_data_parser import ConferenceWebsitePageParser
# parse the data files
print("Read conference data...")
conferences = ConferenceDataHandler.parse("data/conferences.json")
conferences = ConferenceDataHandler.parse("src/julian/data/conferences.json")
filter_words = ConferenceDataHandler.get_filter_words(conferences)
filter_words.extend(["biography", "department", "university", "–", "abstract", "keynote", "bio"])
print("Read speaker website data...")
website_parser = ConferenceWebsitePageParser(conferences, filter_words)
website_parser.parse("data/pages.json")
website_parser.parse("src/julian/data/pages.json")
# get all the speaker and person lists
all_found_persons = ConferenceDataHandler.find_persons(conferences)
speaker_list = ConferenceDataHandler.get_speaker(conferences)
found_speaker_list = []
not_found_speaker_list = []
non_speaker_list = []
for found_persons, speaker_list in zip(all_found_persons, speaker_list):
add_found_speaker = [person.name.lower() for person in found_persons if person.is_speaker]
found_speaker_list.extend(add_found_speaker)
......@@ -31,7 +26,6 @@ for found_persons, speaker_list in zip(all_found_persons, speaker_list):
not person.is_speaker and person.is_speaker is not None])
not_found_speaker_list.extend(
[speaker.lower() for speaker in speaker_list if speaker.lower() not in add_found_speaker])
print(not_found_speaker_list)
print(found_speaker_list)
print(non_speaker_list)
......@@ -40,7 +34,7 @@ print(non_speaker_list)
train_person_list = []
for person_list in all_found_persons:
train_person_list.extend([person for person in person_list if person.is_speaker is not None])
f = open("data/01_speaker_train_data.json", "w")
f = open("src/julian/data/01_speaker_train_data.json", "w")
s = json.dumps([person.__dict__ for person in train_person_list])
f.write("%s\n" % s)
......@@ -48,7 +42,7 @@ f.write("%s\n" % s)
predict_person_list = []
for person_list in all_found_persons:
predict_person_list.extend([person for person in person_list if person.is_speaker is None])
f = open("data/01_speaker_predict_data.json", "w")
f = open("src/julian/data/01_speaker_predict_data.json", "w")
s = json.dumps([person.__dict__ for person in predict_person_list])
f.write("%s\n" % s)
......
import json
from typing import List
from components.website.conference import Conference
from components.website.conference_person import ConferencePerson
from src.julian.components.website.conference import Conference
from src.julian.components.website.conference_person import ConferencePerson
class ConferenceDataHandler:
......
import json
from typing import List, Dict
from analysis.text import NameFinder, KeywordFinder
from analysis.website import PageParser
from components.website.conference import Conference
from components.website.conference_person import ConferencePerson
from components.website.conference_single_page import ConferenceSinglePage
from src.julian.analysis.text import KeywordFinder, NameFinder
from src.julian.analysis.website import PageParser
from src.julian.components.website.conference import Conference
from src.julian.components.website.conference_person import ConferencePerson
from src.julian.components.website.conference_single_page import ConferenceSinglePage
class ConferenceWebsitePageParser:
......@@ -23,14 +23,12 @@ class ConferenceWebsitePageParser:
Loads a json file and creates a list of pages from it.
:param file: the file to read from
"""
print("Loading conference page data...")
with open(file) as f:
data = json.load(f)
for index, conf in enumerate(data):
conference = [c for c in self.conferences if c.name == conf['name']][0]
index = 0
for p in conf['speaker_pages']:
print("Create conference page instance for {}.".format(conference.name))
new_csp = self.create_conference_single_page(conference, index, p)
conference.sub_pages.append(new_csp)
index += 1
......
......@@ -2,13 +2,13 @@ import json
import pandas as pd
from tensorflow_utility.data.data_set import PredictionDataSet
from tensorflow_utility.data.tensorflow_features import TensorflowFeatures
from tensorflow_utility.model_usage.optimizer_utility import OptimizerFunctions
from tensorflow_utility.model_usage.predictor import Predictor
from tensorflow_utility.models.simple_dnn import SimpleDeepNeuralNetworkClassifier
from src.julian.tensorflow_utility.data.data_set import PredictionDataSet
from src.julian.tensorflow_utility.data.tensorflow_features import TensorflowFeatures
from src.julian.tensorflow_utility.model_usage.optimizer_utility import OptimizerFunctions
from src.julian.tensorflow_utility.model_usage.predictor import Predictor
from src.julian.tensorflow_utility.models.simple_dnn import SimpleDeepNeuralNetworkClassifier
person_predict_data = pd.read_json("data/01_speaker_train_data.json")
person_predict_data = pd.read_json("src/julian/data/01_speaker_train_data.json")
person_predict_data["is_speaker"] = pd.Categorical(person_predict_data["is_speaker"])
person_predict_data["is_speaker"] = person_predict_data["is_speaker"].cat.codes.astype(int)
......@@ -28,7 +28,7 @@ feature_columns = set([TensorflowFeatures.get_numeric(f) for f in features])
model = SimpleDeepNeuralNetworkClassifier(feature_columns,
OptimizerFunctions.create_optimizer(0.05, 5),
[20, 20, 10, 10],
"saved_models/predict_speaker_0",
"src/julian/saved_models/predict_speaker_0",
classes=2)
predictions = Predictor.predict_classification(model, PredictionDataSet(person_predict_data, features, targets))
......@@ -36,5 +36,5 @@ Predictor.print_prediction_classification(predictions, labels, person_predict_da
for person, prediction in zip(person_predict_data, predictions):
person["is_speaker"] = prediction['class_ids'][0]
f = open("data/02_speaker_predictions.json", "w")
f = open("src/julian/data/02_speaker_predictions.json", "w")
pd.DataFrame.to_json(person_predict_data[["name", "gender", "conference_name", "is_speaker"]], f, orient="records")
\ No newline at end of file
model_checkpoint_path: "model.ckpt-10000"
all_model_checkpoint_paths: "model.ckpt-10000"
model_checkpoint_path: "model.ckpt-40000"
all_model_checkpoint_paths: "model.ckpt-40000"
model_checkpoint_path: "model.ckpt-10000"
all_model_checkpoint_paths: "model.ckpt-10000"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment