Commit 0129901c authored by Julian Rogawski's avatar Julian Rogawski

tensorflow wrapper added, improved name filtering, added gender estimator

parent 912e5a71
......@@ -6,6 +6,11 @@ name = "pypi"
[packages]
spacy = "*"
"beautifulsoup4" = "*"
tensorflow = "*"
pandas = "*"
sklearn = "*"
scipy = "*"
gender-ai = "*"
[dev-packages]
......
......@@ -17,7 +17,9 @@ class NameFinder:
doc = self.nlp(text)
raw_persons = [entity.text for entity in doc.ents if entity.label_ == 'PER']
filtered_persons = list(filter(lambda p: len(p.split(' ')) > 1, raw_persons)) # make sure we get no single names
# TODO improve this process
# make sure we get no single names and no extremely large names
filtered_persons = list(filter(lambda p: len(p.split(' ')) > 1 and len(p.split(' ')) < 5, raw_persons))
return filtered_persons
......
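A quick sketch (made-up names, not part of this commit) of how the 2-to-4-token name filter above behaves:
raw_persons = ["Ada", "Ada Lovelace", "Jean-Luc Picard", "A B C D E F"]
filtered = [p for p in raw_persons if 1 < len(p.split(' ')) < 5]
# -> ['Ada Lovelace', 'Jean-Luc Picard']: single tokens and very long spans are dropped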
from typing import List
import gender_ai
from components.website.conference_person import ConferencePerson
from components.website.conference_single_page import ConferenceSinglePage
......@@ -56,11 +58,18 @@ class Conference:
is_speaker = True
# TODO calculate more statistics about the person from the conference page
new_conf_p = ConferencePerson(name=n,
gender="",
conference_name=self.name,
is_speaker=is_speaker)
conf_sp_p.append(new_conf_p)
try:
gender = gender_ai.predict(n.split()[0])
if gender.find("hard to guess") > -1:
gender = "Unknown"
new_conf_p = ConferencePerson(name=n,
gender=gender,
conference_name=self.name,
name_occurrence=o,
is_speaker=is_speaker)
conf_sp_p.append(new_conf_p)
except Exception:
# skip names whose first token the gender model cannot classify
pass
# check for duplicates and update statistics of the persons
for cspp in conf_sp_p:
......
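A minimal sketch of the gender lookup added above; the gender_ai API is assumed only from what this diff shows (predict() takes a first name and returns a string label that may contain "hard to guess"):

import gender_ai

def estimate_gender(full_name: str) -> str:
    # Assumed behaviour: predict() classifies a first name; unclassifiable names either
    # raise or return a label containing "hard to guess" (as handled in the try/except above).
    try:
        label = gender_ai.predict(full_name.split()[0])
        return "Unknown" if "hard to guess" in label else label
    except Exception:
        return "Unknown"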
import json
import gender_ai
import gender_guesser.detector as gender
from analysis.utility import UtilityFunctions
from parser.conference_data_parser import ConferenceDataParser
from parser.website_page_data_parser import ConferenceWebsitePageParser
from handler.conference_data_handler import ConferenceDataHandler
from handler.website_page_data_parser import ConferenceWebsitePageParser
# parse the data files
conference_parser = ConferenceDataParser()
conference_parser.parse("data/conferences.json")
website_parser = ConferenceWebsitePageParser(conference_parser.conferences)
conferences = ConferenceDataHandler.parse("data/conferences.json")
filter_words = ConferenceDataHandler.get_filter_words(conferences)
filter_words.extend(["biography", "department", "university", "–", "abstract", "keynote", "bio"])
print(filter_words)
website_parser = ConferenceWebsitePageParser(conferences, filter_words)
website_parser.parse("data/pages.json")
# get all the speaker and person lists
all_known_speaker = []
all_found_persons = []
for c in conference_parser.conferences:
for s in c.speakers:
if s.name != "":
all_known_speaker.append(s)
all_found_persons.extend(c.get_conference_persons())
all_found_persons = ConferenceDataHandler.find_persons(conferences)
speaker_list = ConferenceDataHandler.get_speaker(conferences)
found_speaker_list = [person.name for person in all_found_persons if person.is_speaker]
not_found_speaker_list = [s for s in speaker_list if s not in found_speaker_list]
non_speaker_list = [person.name for person in all_found_persons if
not person.is_speaker and person.is_speaker is not None]
print(speaker_list)
print(not_found_speaker_list)
print(found_speaker_list)
print(non_speaker_list)
# ....
train_person_list = [person for person in all_found_persons if person.is_speaker is not None]
f = open("data/speaker_train_data.json", "w")
s = json.dumps([person.__dict__ for person in train_person_list])
f.write("%s\n" % s)
# .......
predict_person_list = [person for person in all_found_persons if person.is_speaker is None]
f = open("data/speaker_predict_data.json", "w")
s = json.dumps([person.__dict__ for person in predict_person_list])
f.write("%s\n" % s)
# shows stats of the found speakers/persons
actual_speaker = len(all_known_speaker)
actual_speaker = len(speaker_list)
found_speaker = len(found_speaker_list)
found_non_speaker = len(non_speaker_list)
print('Actual Speaker: {}, Found Speaker: {}, Non Speaker found: {}'.format(actual_speaker,
......
......@@ -4,19 +4,19 @@ from components.website.conference import Conference
from components.website.conference_person import ConferencePerson
class ConferenceDataParser:
class ConferenceDataHandler:
"""
A class for parsing a file into a list of conferences.
A collection of utility functions for handling conferences.
"""
def __init__(self):
self.conferences: List[Conference] = []
def parse(self, file: str):
@staticmethod
def parse(file: str) -> List[Conference]:
"""
Loads a json file and creates a list of conferences from it.
:param file: the file to read from
:return the list of conferences
"""
self.conferences = []
conferences = []
with open(file) as f:
data = json.load(f)
for index, conference in enumerate(data):
......@@ -32,5 +32,50 @@ class ConferenceDataParser:
p['organization']
) for p in conference['speakers']]
)
self.conferences.append(newConference)
conferences.append(newConference)
return conferences
@staticmethod
def get_speaker(conferences: List[Conference]) -> List[str]:
"""
Collects the names of all speakers of the given conferences.
:param conferences: the conferences to search
:return the list of speaker names
"""
speakers = []
for c in conferences:
for s in c.speakers:
if s.name != "":
speakers.append(s.name)
return speakers
@staticmethod
def find_persons(conferences: List[Conference]) -> List[ConferencePerson]:
"""
Collects every person mentioned on the conference speaker websites.
:param conferences: the conferences to search
:return the list of persons found
"""
persons = []
for c in conferences:
persons.extend(c.get_conference_persons())
return persons
@staticmethod
def get_filter_words(conferences: List[Conference]) -> List[str]:
"""
Creates a list of keywords that can be filtered out later.
:param conferences: the conferences to extract keywords from
:return a list of words that can be filtered out
"""
filter_words = []
for c in conferences:
for sf in c.sub_fields:
filter_words.extend(sf.split('-'))
filter_words.append(sf)
filter_words.extend([w for w in c.name.split() if len(w) > 3])
filter_words = [fw.lower() for fw in filter_words]
filter_words = list(set(filter_words))
return filter_words
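A rough illustration (hypothetical conference data) of what get_filter_words collects: sub-field fragments plus conference-name words longer than three characters, lower-cased and de-duplicated:
# Hypothetical input
sub_fields = ["machine-learning"]
name = "International Conference on Data Mining"
# Resulting filter words (order not guaranteed because of the set()):
# ['machine', 'learning', 'machine-learning', 'international', 'conference', 'data', 'mining']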
import json
from difflib import SequenceMatcher
from typing import List, Tuple
from analysis.text import NameFinder, KeywordFinder
......@@ -12,10 +13,11 @@ class ConferenceWebsitePageParser:
"""
A class for parsing a file into a list of website pages.
"""
def __init__(self, conferences: List[Conference]):
def __init__(self, conferences: List[Conference], filtered_words: List[str] = None):
self.conferences: List[Conference] = conferences
self.conference_persons: List[ConferencePerson] = []
self.name_finder = NameFinder()
self.filtered_words: List[str] = filtered_words
def parse(self, file: str):
"""
......@@ -44,6 +46,24 @@ class ConferenceWebsitePageParser:
body_text = page_parser.text_from_html()
name_list = self.get_names_and_occurrences(body_text)
"""name_count = len(name_list)
filtered_name_list = []
for n, o in name_list:
filter_out = False
for sn in n.split(" "):
for sf in conference.sub_fields:
for ssf in sf.split(" "):
if SequenceMatcher(None, sn, ssf).ratio() > 0.5:
filter_out = True
if not filter_out:
filtered_name_list.append((n, o))
name_list = filtered_name_list
#Actual Speaker: 192, Found Speaker: 136, Non Speaker found: 1314
filtered_name_count = len(name_list)
if name_count > filtered_name_count:
print(name_count - filtered_name_count)"""
image_count = page_parser.image_count()
word_count = len(body_text.split())
organization_count = 0
......@@ -67,7 +87,9 @@ class ConferenceWebsitePageParser:
:param text: the text to search for names in
:return: the list of names paired with occurrences
"""
raw_name_list = self.name_finder.get_persons(text)
filtered_text = self.filter_text(text)
raw_name_list = self.name_finder.get_persons(filtered_text)
unique_name_list = []
for raw_name in raw_name_list:
is_name_unique = True
......@@ -80,3 +102,19 @@ class ConferenceWebsitePageParser:
for name in unique_name_list:
name_list.append((name, len([raw_name for raw_name in raw_name_list if raw_name.lower() == name.lower()])))
return name_list
def filter_text(self, text: str) -> str:
"""
Removes the configured filter keywords from a text.
:param text: the text to be filtered
:return the filtered text
"""
if not self.filtered_words:
return text
words_from_text = text.split()
filtered_words = [w for w in words_from_text if w.lower() not in self.filtered_words]
return " ".join(filtered_words)
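A short sketch (hypothetical filter words and text) of the keyword filtering performed above:
filter_words = ["keynote", "biography"]
text = "Keynote speaker Jane Doe biography"
" ".join(w for w in text.split() if w.lower() not in filter_words)
# -> 'speaker Jane Doe'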
from math import floor, ceil
import pandas as pd
class DataPartitioning:
"""
Utility functions for data partitioning.
"""
@staticmethod
def train_validate_split(feature_data: pd.DataFrame,
target_data: pd.DataFrame,
training_share: float,
validation_share: float = None) \
-> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
"""
Splits the feature and target DataFrames into training and validation portions.
:param feature_data: The pandas DataFrame for the features.
:param target_data: The pandas DataFrame for the targets.
:param training_share: The percentage of the data that should be used for training.
:param validation_share: The percentage of the data that should be used for validation.
:return The split DataFrames: training_features, training_targets, validation_features, validation_targets.
"""
# calculate the sizes of the DataFrame splits
scope = feature_data.shape[0]
training_amount = ceil(training_share * scope)
validation_amount = scope - training_amount
if validation_share is not None:
validation_amount = floor(validation_share * scope)
# split the DataFrames
training_features = feature_data.head(training_amount).copy()
training_targets = target_data.head(training_amount).copy()
validation_features = feature_data.tail(validation_amount).copy()
validation_targets = target_data.tail(validation_amount).copy()
return training_features, training_targets, validation_features, validation_targets
@staticmethod
def train_validate_test_split(feature_data: pd.DataFrame,
target_data: pd.DataFrame,
training_share: float,
validation_share: float,
testing_share: float = None) \
-> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
"""
Splits the feature and target DataFrames into training, validation, and testing portions.
:param feature_data: The pandas DataFrame for the features.
:param target_data: The pandas DataFrame for the targets.
:param training_share: The percentage of the data that should be used for training.
:param validation_share: The percentage of the data that should be used for validation.
:param testing_share: The percentage of the data that should be used for testing.
:return The split DataFrames: training_features, training_targets, validation_features, validation_targets, testing_features, testing_targets.
"""
# calculate the sizes of the DataFrame splits
scope = feature_data.shape[0]
training_amount = floor(training_share * scope)
validation_amount = floor(validation_share * scope)
testing_amount = scope - (training_amount + validation_amount)
if testing_share is not None:
testing_amount = floor(testing_share * scope)
# split the DataFrames
training_features = feature_data.head(training_amount).copy()
training_targets = target_data.head(training_amount).copy()
remaining_features = feature_data.tail(scope - training_amount)
remaining_targets = target_data.tail(scope - training_amount)
validation_features = remaining_features.head(validation_amount).copy()
validation_targets = remaining_targets.head(validation_amount).copy()
testing_features = remaining_features.tail(testing_amount).copy()
testing_targets = remaining_targets.tail(testing_amount).copy()
return training_features, training_targets, validation_features, validation_targets, testing_features, testing_targets
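A brief usage sketch of both split helpers on hypothetical data (ten rows, an 80/20 split and a 70/15/15 split):

import pandas as pd
from tensorflow_utility.data.data_partitioning import DataPartitioning

features = pd.DataFrame({"word_count": range(10)})
targets = pd.DataFrame({"is_speaker": [0, 1] * 5})

# 80/20: the first 8 rows become training data, the last 2 validation data
train_x, train_y, val_x, val_y = DataPartitioning.train_validate_split(
    features, targets, training_share=0.8)

# 70/15/15: 7 training rows, 1 validation row, 2 testing rows (floor-based sizing)
splits = DataPartitioning.train_validate_test_split(
    features, targets, training_share=0.7, validation_share=0.15)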
import numpy as np
import pandas as pd
def get_quantile_based_buckets(data: pd.DataFrame, num_buckets):
"""
Takes the DataFrame and creates thresholds for the buckets which define how the data should be categorised.
Quantile-based means the bucket thresholds are defined in such a way
that every bucket contains an approximately equal amount of data.
:param data: the data from which the bucket ranges should be generated
:param num_buckets: the number of buckets
:return: the thresholds of the buckets
"""
quantiles = data.quantile([(i + 1.) / (num_buckets + 1.) for i in range(num_buckets)])
return [quantiles[q] for q in quantiles.keys()]
def get_linear_based_buckets(data: pd.DataFrame, num_buckets):
"""
Takes the DataFrame and creates thresholds for the buckets which define how the data should be categorised.
The value range between the minimum and maximum of the DataFrame is split linearly and the resulting boundaries
are taken as the thresholds.
:param data: the data from which the bucket ranges should be generated
:param num_buckets: the number of buckets
:return: the thresholds of the buckets
"""
return np.linspace(data.min(), data.max(), num_buckets)
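A small sketch (hypothetical feature values) of what the two bucketing helpers return; both are called here with a single pandas Series as the source data:

import pandas as pd
from tensorflow_utility.data.data_processing.bucketize_functions import (
    get_quantile_based_buckets, get_linear_based_buckets)

word_counts = pd.Series([10, 20, 30, 40, 50, 60, 70, 80])
get_quantile_based_buckets(word_counts, 3)  # -> [27.5, 45.0, 62.5]
get_linear_based_buckets(word_counts, 3)    # -> array([10., 45., 80.])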
from typing import List
import pandas as pd
from tensorflow_utility.data.data_processing.normalize_functions import linear_scale
class DataProcessingFunctions:
"""
Utility functions for feature processing
"""
@staticmethod
def create_targets(source_data: pd.DataFrame, targets: List[str]) -> pd.DataFrame:
"""
Selects the target columns from a DataFrame.
:param source_data: The pandas DataFrame from which the targets should be selected.
:param targets: The list of features used as targets.
:return A new DataFrame with the selected targets.
"""
output_targets = source_data[targets]
return output_targets.copy()
@staticmethod
def add_targets(target_data: pd.DataFrame, targets: List[str], data):
"""
Adds targets to a DataFrame.
:param target_data: The pandas DataFrame to which the targets should be added to.
:param targets: The list of target names.
:param data: The list of target data.
"""
for t in targets:
target_data[t] = data
@staticmethod
def filter_features(data: pd.DataFrame, features: List[str]) -> pd.DataFrame:
"""
Filters features from a DataFrame.
:param data: The pandas DataFrame from which the features should be selected.
:param features: The list of features to keep.
:return A new DataFrame with selected features.
"""
selected_features = data[features]
return selected_features.copy()
@staticmethod
def normalize(feature: pd.DataFrame, normalize_function=linear_scale):
"""
Normalizes a feature of a DataFrame with a given normalization function.
:param feature: the feature that should be normalized
:param normalize_function: the normalization function
:return the normalized feature
"""
feature = normalize_function(feature)
return feature
import math
def linear_scale(series):
min_val = series.min()
max_val = series.max()
scale = (max_val - min_val) / 2.0
return series.apply(lambda x: ((x - min_val) / scale) - 1.0)
def log_normalize(series):
return series.apply(lambda x: math.log(x + 1.0))
def clip(series, clip_to_min, clip_to_max):
return series.apply(lambda x: (min(max(x, clip_to_min), clip_to_max)))
def z_score_normalize(series):
mean = series.mean()
std_dv = series.std()
return series.apply(lambda x: (x - mean) / std_dv)
def binary_threshold(series, threshold):
return series.apply(lambda x: (1 if x > threshold else 0))
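A quick sketch (hypothetical values) of the scaling helpers defined above:

import pandas as pd
from tensorflow_utility.data.data_processing.normalize_functions import (
    linear_scale, binary_threshold)

s = pd.Series([0.0, 5.0, 10.0])
linear_scale(s).tolist()                   # [-1.0, 0.0, 1.0]
binary_threshold(s, threshold=4).tolist()  # [0, 1, 1]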
from typing import List
import numpy as np
import pandas as pd
from tensorflow_utility.data.data_processing.data_processing_functions import DataProcessingFunctions
from tensorflow_utility.data.data_partitioning import DataPartitioning
class PredictionDataSet:
"""
A set of Data divided into features and targets for prediction.
"""
def __init__(self, data: pd.DataFrame, features: List[str], targets: List[str]):
"""
Initializes the DataSet.
:param data: the origin data
:param features: the selected feature data
:param targets: the selected target data
"""
self.features: List[str] = features
self.targets: List[str] = targets
self.prediction_base: pd.DataFrame = DataProcessingFunctions.filter_features(data, self.features)
self.prediction_result: pd.DataFrame = DataProcessingFunctions.create_targets(data, self.targets)
class TrainingDataSet:
"""
A set of data divided into training, validation, and (if needed) testing data.
"""
def __init__(self, data: pd.DataFrame, features: List[str], targets: List[str]):
"""
Initializes the DataSet.
:param data: the origin data
:param features: the selected feature data
:param targets: the selected target data
"""
self.original_data: pd.DataFrame = data.copy()
self.features: List[str] = features
self.targets: List[str] = targets
self.feature_data: pd.DataFrame = None
self.target_data: pd.DataFrame = None
self.training_examples: pd.DataFrame = None
self.training_targets: pd.DataFrame = None
self.validation_examples: pd.DataFrame = None
self.validation_targets: pd.DataFrame = None
self.testing_examples: pd.DataFrame = None
self.testing_targets: pd.DataFrame = None
def split_data(self, training_share: float, validation_share: float = None, shuffle: bool = True):
"""
Splits the data into training, validation (and testing) data.
:param training_share: the percentage of training data
:param validation_share: the percentage of validation data
:param shuffle: Shuffles the data before splitting if True
"""
splitting_df = self.original_data
if shuffle:
splitting_df = self.original_data.reindex(np.random.permutation(self.original_data.index))
self.feature_data = DataProcessingFunctions.filter_features(splitting_df, self.features)
self.target_data = DataProcessingFunctions.create_targets(splitting_df, self.targets)
if validation_share is None:
df_list = DataPartitioning.train_validate_split(
self.feature_data, self.target_data, training_share)
else:
df_list = DataPartitioning.train_validate_test_split(
self.feature_data, self.target_data, training_share, validation_share)
self.training_examples = df_list[0]
self.training_targets = df_list[1]
self.validation_examples = df_list[2]
self.validation_targets = df_list[3]
if len(df_list) > 4:
self.testing_examples = df_list[4]
self.testing_targets = df_list[5]
else:
self.testing_examples = None
self.testing_targets = None
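A compact usage sketch of the data set wrapper above on hypothetical data; the module path of TrainingDataSet is not shown in this diff, so the import below is an assumption:

import pandas as pd
from tensorflow_utility.data.data_set import TrainingDataSet  # assumed module path

df = pd.DataFrame({
    "word_count":  [120, 80, 400, 50, 300, 90, 150, 60, 220, 70],
    "image_count": [1, 0, 3, 0, 2, 1, 1, 0, 2, 0],
    "is_speaker":  [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
})
data_set = TrainingDataSet(df, features=["word_count", "image_count"], targets=["is_speaker"])
data_set.split_data(training_share=0.7, validation_share=0.15)
# -> 7 training rows, 1 validation row, 2 testing rows (floor-based sizing)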
from typing import Set
import tensorflow as tf
import pandas as pd
from tensorflow.python.feature_column.feature_column import _FeatureColumn
from tensorflow_utility.data.data_processing.bucketize_functions import get_quantile_based_buckets
class TensorflowFeatures:
"""
Utility Class for creating TensorFlow feature columns used by a model.
"""
@staticmethod
def get_numeric(data: str) -> _FeatureColumn:
"""
Creates a real-valued (numerical) TensorFlow feature column.
:param data: the name of the column on which the TensorFlow feature column is based
:return: a real-valued or numerical TensorFlow feature column
"""
return tf.feature_column.numeric_column(data)
@staticmethod
def get_bucket(data: str,
bucket_count: int,
bucketize_source_data: pd.DataFrame,
bucketize_function=get_quantile_based_buckets) -> _FeatureColumn:
"""
Creates a bucketized numerical TensorFlow feature column.
:param data: the name of the column on which the TensorFlow feature column is based
:param bucket_count: the number of buckets
:param bucketize_source_data: the data on which the bucketize_function operates
:param bucketize_function: the function for creating the bucket ranges
:return: a bucketized TensorFlow feature column
"""
numeric_column = tf.feature_column.numeric_column(data)
return tf.feature_column.bucketized_column(numeric_column,
boundaries=bucketize_function(bucketize_source_data, bucket_count))
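A brief sketch of the two column helpers above with a hypothetical "word_count" feature; it assumes TensorflowFeatures is importable from its (unshown) module and that the source data fits in a single pandas Series:

import pandas as pd

word_counts = pd.Series([50, 120, 300, 800])  # hypothetical source data for bucketizing
numeric_col = TensorflowFeatures.get_numeric("word_count")
bucket_col = TensorflowFeatures.get_bucket("word_count", bucket_count=3,
                                           bucketize_source_data=word_counts)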
@staticmethod
def get_cross(feature_columns: Set[_FeatureColumn]) -> Set[_FeatureColumn]:
"""
Creates a crossed bucketized TensorFlow feature column. Does not work with DNNs.