Commit 27d2d2fc authored by Julian Rogawski

updated prediction

parent 4b7ed143
+from typing import List
 from bs4 import BeautifulSoup
 from bs4.element import Comment
@@ -33,7 +35,101 @@ class PageParser:
         """
         texts = self.soup.findAll(text=True)
         visible_texts = filter(PageParser.tag_visible, texts)
-        return u" ".join(t.strip() for t in visible_texts)
+        links = self.soup.findAll('a')
+        images = self.soup.find_all('img')
+        headers = self.soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+        text = ""
+        for link in links:
+            if link.string:
+                text += " " + link.string.strip()
+        for img in images:
+            text += " " + img.get('alt', '').strip()
+        for h in headers:
+            text += " " + h.text.strip()
+        return text + u" ".join(t.strip() for t in visible_texts)
+
+    def name_stats(self, names: List[str]):
+        """
+        Counts, for every name, exact (case-insensitive) matches in link texts,
+        image alt attributes and headers: once for the whole page, and once
+        restricted to divs that sit under a "speaker" header.
+        """
+        name_stats = dict()
+        speaker_name_stats = dict()
+        links = self.soup.findAll('a')
+        images = self.soup.find_all('img')
+        headers = self.soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+        for name in names:
+            name_stats[name] = [0, 0, 0]
+            speaker_name_stats[name] = [0, 0, 0]
+            for link in links:
+                if link.string:
+                    if name.lower() == link.string.strip().lower():
+                        name_stats[name][0] += 1
+            for image in images:
+                if image.get('alt', ''):
+                    if name.lower() == image.get('alt', '').strip().lower():
+                        name_stats[name][1] += 1
+            for header in headers:
+                if header.text:
+                    if name.lower() == header.text.strip().lower():
+                        name_stats[name][2] += 1
+        for h in self.soup.find_all("div"):
+            parent_speaker_div = False
+            if h.parent:
+                header = [head.text for head in h.parent.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])]
+                for head in header:
+                    # str.find() returns -1 when absent, which is truthy; test membership instead
+                    if "speaker" in head.lower():
+                        parent_speaker_div = True
+            if parent_speaker_div:
+                speaker_div = False
+                header = [head.text for head in h.find_all(["h1", "h2", "h3", "h4", "h5", "h6"], recursive=False)]
+                for head in header:
+                    if "speaker" in head.lower():
+                        speaker_div = True
+                if speaker_div:
+                    links = h.findAll('a')
+                    images = h.find_all('img')
+                    headers = h.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+                    for name in names:
+                        for link in links:
+                            if link.string:
+                                if name.lower() == link.string.strip().lower():
+                                    speaker_name_stats[name][0] += 1
+                        for image in images:
+                            if image.get('alt', ''):
+                                if name.lower() == image.get('alt', '').strip().lower():
+                                    speaker_name_stats[name][1] += 1
+                        for header in headers:
+                            if header.text:
+                                if name.lower() == header.text.strip().lower():
+                                    speaker_name_stats[name][2] += 1
+            """child_speaker_div = False
+            children = h.findChildren(recursive=False)
+            for child in children:
+                header = [head.text for head in child.find_all(["h1", "h2", "h3", "h4", "h5", "h6"], recursive=False)]
+                for head in header:
+                    if head.find("speaker") or head.find("Speaker"):
+                        child_speaker_div = True
+            if child_speaker_div:
+                children = h.findChildren(recursive=False)
+                for child in children:
+                    links = child.findAll('a')
+                    images = child.find_all('img')
+                    headers = child.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+                    print(child.text)
+                    for name in names:
+                        for link in links:
+                            if name == link.string:
+                                speaker_name_stats[name][0] += 1
+                        for image in images:
+                            if name == image.get('alt', ''):
+                                speaker_name_stats[name][1] += 1
+                        for header in headers:
+                            if name == header.text:
+                                speaker_name_stats[name][2] += 1"""
+        return name_stats, speaker_name_stats
 
     def image_count(self):
         """
......
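For review purposes, a minimal smoke test of the new name_stats method. This is a sketch, not part of the commit; it assumes PageParser builds a BeautifulSoup from the body string it receives, as its use in create_conference_single_page suggests.

# Hypothetical smoke test for name_stats (illustrative HTML, not from the commit).
from analysis.website import PageParser

body = """
<div>
  <h1>Conference</h1>
  <div>
    <h2>Speakers</h2>
    <a href="/jane">Jane Doe</a>
  </div>
</div>
"""
parser = PageParser(body)
name_stats, speaker_name_stats = parser.name_stats(["Jane Doe"])
# Each value is [link matches, image-alt matches, header matches]:
# name_stats         -> {"Jane Doe": [1, 0, 0]}  (whole page)
# speaker_name_stats -> {"Jane Doe": [1, 0, 0]}  (only the inner div counts: it has
#                       a direct "Speakers" header and a parent with one)
print(name_stats, speaker_name_stats)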
@@ -45,7 +45,7 @@ class Conference:
         conf_p = []
         for sp in self.sub_pages:
             conf_sp_p = []
-            for n, o in sp.name_list:
+            for n in sp.name_list:
                 is_speaker = None
                 speakers_set = True
                 for s in self.speakers:
@@ -65,7 +65,13 @@ class Conference:
                     new_conf_p = ConferencePerson(name=n,
                                                   gender=gender,
                                                   conference_name=self.name,
-                                                  name_occurrence=o,
+                                                  name_occurrence=sp.name_occurrence[n],
+                                                  name_link_occurrence=sp.name_stats[n][0],
+                                                  name_image_occurrence=sp.name_stats[n][1],
+                                                  name_header_occurrence=sp.name_stats[n][2],
+                                                  speaker_name_link_occurrence=sp.speaker_name_stats[n][0],
+                                                  speaker_name_image_occurrence=sp.speaker_name_stats[n][1],
+                                                  speaker_name_header_occurrence=sp.speaker_name_stats[n][2],
                                                   is_speaker=is_speaker)
                     conf_sp_p.append(new_conf_p)
             except:
......
@@ -8,6 +8,12 @@ class ConferencePerson:
                  conference_name: str,
                  name_occurrence: int = 0,
                  word_count_occurrence: int = 0,
+                 name_link_occurrence: int = 0,
+                 name_image_occurrence: int = 0,
+                 name_header_occurrence: int = 0,
+                 speaker_name_link_occurrence: int = 0,
+                 speaker_name_image_occurrence: int = 0,
+                 speaker_name_header_occurrence: int = 0,
                  is_speaker: bool = None):
         """
         Initializes the class with some optional variables.
@@ -25,6 +31,12 @@ class ConferencePerson:
         # features
         self.name_occurrence: int = name_occurrence
         self.word_count_occurrence: int = word_count_occurrence
+        self.name_link_occurrence: int = name_link_occurrence
+        self.name_image_occurrence: int = name_image_occurrence
+        self.name_header_occurrence: int = name_header_occurrence
+        self.speaker_name_link_occurrence: int = speaker_name_link_occurrence
+        self.speaker_name_image_occurrence: int = speaker_name_image_occurrence
+        self.speaker_name_header_occurrence: int = speaker_name_header_occurrence
         # label
         self.is_speaker: bool = is_speaker
\ No newline at end of file
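A hedged sketch of how the widened feature set reaches a ConferencePerson; all names and values below are illustrative, the field names come from the diff above.

# Illustrative only: the six new *_occurrence counters default to 0 and are filled
# from ConferenceSinglePage.name_stats / speaker_name_stats in Conference above.
person = ConferencePerson(name="Jane Doe",                # hypothetical person
                          gender="female",
                          conference_name="ExampleConf",  # hypothetical conference
                          name_occurrence=3,
                          name_link_occurrence=1,
                          name_image_occurrence=0,
                          name_header_occurrence=1,
                          speaker_name_link_occurrence=1,
                          speaker_name_image_occurrence=0,
                          speaker_name_header_occurrence=1,
                          is_speaker=True)
# person.__dict__ is what json.dumps writes into the train/predict files below.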
@@ -10,7 +10,10 @@ class ConferenceSinglePage:
                  conference,
                  url: str,
                  speaker_page: bool,
-                 name_list: List[Tuple[str, int]],
+                 name_list: List[str],
+                 name_occurrence: Dict[str, int],
+                 name_stats: Dict[str, List],
+                 speaker_name_stats: Dict[str, List],
                  image_count: int,
                  word_count: int,
                  organization_count: int,
@@ -34,7 +37,10 @@ class ConferenceSinglePage:
         self.conference = conference
         self.url: str = url
         self.speaker_page: bool = speaker_page
-        self.name_list: List[Tuple[str, int]] = name_list
+        self.name_list: List[str] = name_list
+        self.name_occurrence: Dict[str, int] = name_occurrence
+        self.name_stats: Dict[str, List] = name_stats
+        self.speaker_name_stats: Dict[str, List] = speaker_name_stats
         # additional info
         self.image_count: int = image_count
......
@@ -8,41 +8,52 @@ from handler.conference_data_handler import ConferenceDataHandler
 from handler.website_page_data_parser import ConferenceWebsitePageParser
 
 # parse the data files
 print("Read conference data...")
 conferences = ConferenceDataHandler.parse("data/conferences.json")
 filter_words = ConferenceDataHandler.get_filter_words(conferences)
 filter_words.extend(["biography", "department", "university", "–", "abstract", "keynote", "bio"])
 print(filter_words)
 
 print("Read speaker website data...")
 website_parser = ConferenceWebsitePageParser(conferences, filter_words)
 website_parser.parse("data/pages.json")
 
 # get all the speaker and person lists
 all_found_persons = ConferenceDataHandler.find_persons(conferences)
 speaker_list = ConferenceDataHandler.get_speaker(conferences)
-found_speaker_list = [person.name for person in all_found_persons if person.is_speaker]
-not_found_speaker_list = [s for s in speaker_list if s not in found_speaker_list]
-non_speaker_list = [person.name for person in all_found_persons if
-                    not person.is_speaker and person.is_speaker is not None]
+found_speaker_list = []
+not_found_speaker_list = []
+non_speaker_list = []
+# conference_speakers holds one conference's speaker names per iteration
+for found_persons, conference_speakers in zip(all_found_persons, speaker_list):
+    add_found_speaker = [person.name.lower() for person in found_persons if person.is_speaker]
+    found_speaker_list.extend(add_found_speaker)
+    non_speaker_list.extend([person.name.lower() for person in found_persons if
+                             not person.is_speaker and person.is_speaker is not None])
+    not_found_speaker_list.extend(
+        [speaker.lower() for speaker in conference_speakers if speaker.lower() not in add_found_speaker])
 print(speaker_list)
 print(not_found_speaker_list)
 print(found_speaker_list)
 print(non_speaker_list)
 
-# ....
-train_person_list = [person for person in all_found_persons if person.is_speaker is not None]
-f = open("data/speaker_train_data.json", "w")
+# create train data
+train_person_list = []
+for person_list in all_found_persons:
+    train_person_list.extend([person for person in person_list if person.is_speaker is not None])
+f = open("data/01_speaker_train_data.json", "w")
 s = json.dumps([person.__dict__ for person in train_person_list])
 f.write("%s\n" % s)
 
-# .......
-predict_person_list = [person for person in all_found_persons if person.is_speaker is None]
-f = open("data/speaker_predict_data.json", "w")
+# create predict data
+predict_person_list = []
+for person_list in all_found_persons:
+    predict_person_list.extend([person for person in person_list if person.is_speaker is None])
+f = open("data/01_speaker_predict_data.json", "w")
 s = json.dumps([person.__dict__ for person in predict_person_list])
 f.write("%s\n" % s)
 
 # shows stats of the found speakers/persons
-actual_speaker = len(speaker_list)
+actual_speaker = len(found_speaker_list) + len(not_found_speaker_list)
 found_speaker = len(found_speaker_list)
 found_non_speaker = len(non_speaker_list)
 print('Actual Speaker: {}, Found Speaker: {}, Non Speaker found: {}'.format(actual_speaker,
......
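The zip in the script above relies on the new per-conference nesting of find_persons and get_speaker (see the handler diff below); a toy illustration of the shapes, with hypothetical names.

# Toy shapes only: one inner list per conference, index-aligned across both lists.
all_found_persons_shape = [["<ConferencePerson Jane Doe>", "<ConferencePerson Max Doe>"],
                           ["<ConferencePerson Ada Lovelace>"]]
speaker_list_shape = [["Jane Doe", "John Roe"], ["Ada Lovelace"]]
for found_persons, conference_speakers in zip(all_found_persons_shape, speaker_list_shape):
    # each iteration sees exactly one conference's persons and speaker names,
    # so a speaker is only reported "not found" against her own conference
    print(len(found_persons), conference_speakers)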
@@ -36,7 +36,7 @@ class ConferenceDataHandler:
         return conferences
 
     @staticmethod
-    def get_speaker(conferences: List[Conference]) -> List[str]:
+    def get_speaker(conferences: List[Conference]) -> List[List[str]]:
         """
         Search every speaker of the conference.
         :param conferences: the conferences to search from
@@ -44,13 +45,15 @@ class ConferenceDataHandler:
         """
         speakers = []
         for c in conferences:
+            add_speaker = []
             for s in c.speakers:
                 if s.name != "":
-                    speakers.append(s.name)
+                    add_speaker.append(s.name)
+            speakers.append(add_speaker)
         return speakers
 
     @staticmethod
-    def find_persons(conferences: List[Conference]) -> List[ConferencePerson]:
+    def find_persons(conferences: List[Conference]) -> List[List[ConferencePerson]]:
         """
         Search every person mentioned on the conference speaker websites.
         :param conferences: the conferences to search from
@@ -58,7 +60,7 @@ class ConferenceDataHandler:
         """
         persons = []
         for c in conferences:
-            persons.extend(c.get_conference_persons())
+            persons.append(c.get_conference_persons())
         return persons
 
     @staticmethod
......
 import json
-from difflib import SequenceMatcher
-from typing import List, Tuple
+from typing import List, Dict
 
 from analysis.text import NameFinder, KeywordFinder
 from analysis.website import PageParser
@@ -24,12 +23,14 @@ class ConferenceWebsitePageParser:
         Loads a json file and creates a list of pages from it.
         :param file: the file to read from
         """
+        print("Loading conference page data...")
         with open(file) as f:
             data = json.load(f)
             for index, conf in enumerate(data):
                 conference = [c for c in self.conferences if c.name == conf['name']][0]
                 index = 0
                 for p in conf['speaker_pages']:
+                    print("Create conference page instance for {}.".format(conference.name))
                     new_csp = self.create_conference_single_page(conference, index, p)
                     conference.sub_pages.append(new_csp)
                     index += 1
@@ -44,25 +45,9 @@ class ConferenceWebsitePageParser:
         """
         page_parser = PageParser(body)
         body_text = page_parser.text_from_html()
-        name_list = self.get_names_and_occurrences(body_text)
-        """name_count = len(name_list)
-        filtered_name_list = []
-        for n, o in name_list:
-            filter_out = False
-            for sn in n.split(" "):
-                for sf in conference.sub_fields:
-                    for ssf in sf.split(" "):
-                        if SequenceMatcher(None, sn, ssf).ratio() > 0.5:
-                            filter_out = True
-            if not filter_out:
-                filtered_name_list.append((n, o))
-        name_list = filtered_name_list
-        # Actual Speaker: 192, Found Speaker: 136, Non Speaker found: 1314
-        filtered_name_count = len(name_list)
-        if name_count > filtered_name_count:
-            print(name_count - filtered_name_count)"""
+        name_occurrence = self.get_names_and_occurrences(body_text)
+        name_list = list(name_occurrence.keys())
+        name_stats, speaker_name_stats = page_parser.name_stats(name_list)
         image_count = page_parser.image_count()
         word_count = len(body_text.split())
@@ -73,6 +58,9 @@ class ConferenceWebsitePageParser:
                                         url=conference.speaker_urls[url_index],
                                         speaker_page=True,
                                         name_list=name_list,
+                                        name_occurrence=name_occurrence,
+                                        name_stats=name_stats,
+                                        speaker_name_stats=speaker_name_stats,
                                         image_count=image_count,
                                         word_count=word_count,
                                         organization_count=organization_count,
@@ -81,7 +69,7 @@ class ConferenceWebsitePageParser:
                                         negative_keyword_count=negative_kc)
         return new_csp
 
-    def get_names_and_occurrences(self, text) -> List[Tuple[str, int]]:
+    def get_names_and_occurrences(self, text) -> Dict[str, int]:
         """
         Maps each person name found in a text to its number of occurrences.
         :param text: the text to search for names in
@@ -98,9 +86,9 @@ class ConferenceWebsitePageParser:
                     is_name_unique = False
             if is_name_unique:
                 unique_name_list.append(raw_name)
-        name_list = []
+        name_list = dict()
         for name in unique_name_list:
-            name_list.append((name, len([raw_name for raw_name in raw_name_list if raw_name.lower() == name.lower()])))
+            name_list[name] = len([raw_name for raw_name in raw_name_list if raw_name.lower() == name.lower()])
         return name_list
 
     def filter_text(self, text) -> str:
......
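The counting loop at the end of get_names_and_occurrences could also be written with collections.Counter; a simplified sketch that only folds exact case differences (the fuzzier uniqueness check in the method above is elided here).

from collections import Counter

def names_and_occurrences(raw_name_list):
    # count case-insensitively, then report each count under the first-seen spelling
    lowered = Counter(raw_name.lower() for raw_name in raw_name_list)
    name_occurrence = {}
    for raw_name in raw_name_list:
        if raw_name.lower() not in {name.lower() for name in name_occurrence}:
            name_occurrence[raw_name] = lowered[raw_name.lower()]
    return name_occurrence

print(names_and_occurrences(["Jane Doe", "jane doe", "John Roe"]))
# -> {'Jane Doe': 2, 'John Roe': 1}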
import json

import pandas as pd

from tensorflow_utility.data.data_set import PredictionDataSet
from tensorflow_utility.data.tensorflow_features import TensorflowFeatures
from tensorflow_utility.model_usage.optimizer_utility import OptimizerFunctions
from tensorflow_utility.model_usage.predictor import Predictor
from tensorflow_utility.models.simple_dnn import SimpleDeepNeuralNetworkClassifier

person_predict_data = pd.read_json("data/01_speaker_train_data.json")
person_predict_data["is_speaker"] = pd.Categorical(person_predict_data["is_speaker"])
person_predict_data["is_speaker"] = person_predict_data["is_speaker"].cat.codes.astype(int)

features = ["name_occurrence",
            "name_link_occurrence",
            "name_image_occurrence",
            "name_header_occurrence",
            "speaker_name_link_occurrence",
            "speaker_name_image_occurrence",
            "speaker_name_header_occurrence"]
targets = ["is_speaker"]
labels = ["Non-Speaker", "Speaker"]

feature_columns = {TensorflowFeatures.get_numeric(f) for f in features}
model = SimpleDeepNeuralNetworkClassifier(feature_columns,
                                          OptimizerFunctions.create_optimizer(0.05, 5),
                                          [20, 20, 10, 10],
                                          "saved_models/predict_speaker_0",
                                          classes=2)

predictions = Predictor.predict_classification(model, PredictionDataSet(person_predict_data, features, targets))
Predictor.print_prediction_classification(predictions, labels, person_predict_data["is_speaker"].astype(int))

# iterating a DataFrame yields column names, so attach the predicted classes as a column
person_predict_data["is_speaker"] = [prediction['class_ids'][0] for prediction in predictions]

with open("data/02_speaker_predictions.json", "w") as f:
    person_predict_data[["name", "gender", "conference_name", "is_speaker"]].to_json(f, orient="records")
\ No newline at end of file
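Given the ConferencePerson fields above, a single record in data/01_speaker_predict_data.json should look roughly like the following; all values are illustrative, and is_speaker of None marks the persons to predict.

# Roughly what json.dumps(person.__dict__) yields for one person (values illustrative):
record = {
    "name": "Jane Doe",              # hypothetical
    "gender": "female",
    "conference_name": "ExampleConf",
    "name_occurrence": 3,
    "word_count_occurrence": 0,
    "name_link_occurrence": 1,
    "name_image_occurrence": 0,
    "name_header_occurrence": 1,
    "speaker_name_link_occurrence": 1,
    "speaker_name_image_occurrence": 0,
    "speaker_name_header_occurrence": 1,
    "is_speaker": None,
}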
model_checkpoint_path: "model.ckpt-10000"
all_model_checkpoint_paths: "model.ckpt-10000"
 import tensorflow as tf
 from tensorflow.python.training.adagrad import AdagradOptimizer
+from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.optimizer import Optimizer
@@ -11,7 +12,7 @@ class OptimizerFunctions:
     @staticmethod
     def create_optimizer(learning_rate: float,
                          clip_gradients: float = None,
-                         optimizer_type: Optimizer.__class__ = AdagradOptimizer) -> Optimizer:
+                         optimizer_type: Optimizer.__class__ = AdamOptimizer) -> Optimizer:
         """
         Creates a TensorFlowOptimizer.
         :param learning_rate: the learning rate for the created optimizer
......
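A minimal call sketch for the changed default; the first form mirrors the call in the prediction script above, the second still requests Adagrad explicitly.

# AdamOptimizer is now the default optimizer_type; Adagrad remains available.
optimizer = OptimizerFunctions.create_optimizer(0.05, 5)
adagrad = OptimizerFunctions.create_optimizer(learning_rate=0.05,
                                              clip_gradients=5,
                                              optimizer_type=AdagradOptimizer)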