Commit 3af74f18 authored by Orkut Karaçalık's avatar Orkut Karaçalık
Browse files

Merge branch 'master' of

parents 29b5e7bf df956925
......@@ -10,4 +10,5 @@ scrapers/*.json
\ No newline at end of file
\ No newline at end of file
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
spacy = "*"
"beautifulsoup4" = "*"
python_version = "3.6"
"_meta": {
"hash": {
"sha256": "eb4da93af39ac3bb0d5339ad958e2ebb395fb138014ed668f6f7d2114393956e"
"pipfile-spec": 6,
"requires": {
"python_version": "3.6"
"sources": [
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
"default": {
"beautifulsoup4": {
"hashes": [
"index": "pypi",
"version": "==4.6.0"
"cymem": {
"hashes": [
"version": "==1.31.2"
"cytoolz": {
"hashes": [
"version": "==0.8.2"
"dill": {
"hashes": [
"version": "=="
"msgpack-numpy": {
"hashes": [
"version": "==0.4.1"
"msgpack-python": {
"hashes": [
"version": "==0.5.6"
"murmurhash": {
"hashes": [
"version": "==0.28.0"
"numpy": {
"hashes": [
"version": "==1.14.5"
"pathlib": {
"hashes": [
"version": "==1.0.1"
"plac": {
"hashes": [
"version": "==0.9.6"
"preshed": {
"hashes": [
"version": "==1.0.0"
"regex": {
"hashes": [
"version": "==2017.4.5"
"six": {
"hashes": [
"version": "==1.11.0"
"spacy": {
"hashes": [
"index": "pypi",
"version": "==2.0.11"
"termcolor": {
"hashes": [
"version": "==1.1.0"
"thinc": {
"hashes": [
"version": "==6.10.2"
"toolz": {
"hashes": [
"version": "==0.9.0"
"tqdm": {
"hashes": [
"version": "==4.23.4"
"ujson": {
"hashes": [
"version": "==1.35"
"wrapt": {
"hashes": [
"version": "==1.10.11"
"develop": {}
import spacy
class NameFinder:
    """A class for text analysis, for name finding."""

    def __init__(self):
        # Multi-language NER model; its 'PER' and 'ORG' labels are consumed below.
        self.nlp = spacy.load('xx_ent_wiki_sm')  # loads this model for identifying names

    def get_persons(self, text):
        """
        Creates a list of person names from a text.

        :param text: The text in which this function searches for names.
        :return: list of found person names (only multi-word names are kept).
        """
        doc = self.nlp(text)
        raw_persons = [entity.text for entity in doc.ents if entity.label_ == 'PER']
        # make sure we get no single names
        filtered_persons = [p for p in raw_persons if len(p.split(' ')) > 1]
        return filtered_persons

    def get_organisations(self, text):
        """
        Creates a list of organisation names from a text.

        :param text: The text in which this function searches for names.
        :return: list of found organisation names.
        """
        doc = self.nlp(text)
        organisations = [entity.text for entity in doc.ents if entity.label_ == 'ORG']
        return organisations
class KeywordFinder:
    """A class for text analysis, for keyword finding."""

    # NOTE(review): the original keyword lists were truncated by the extraction
    # of this view — only the first entry of each survived. Restore the full
    # lists from version control.
    positive_keywords = ['Speaker',
                         ]
    neutral_keywords = ['Conference',
                        ]
    negative_keywords = ['Organize',
                         ]

    def get_keyword_count(text: str):
        """
        Counts and returns the number of found keywords.

        Call as ``KeywordFinder.get_keyword_count(text)`` (no ``self`` parameter).

        :param text: The text in which this function searches for keywords.
        :return: dicts of keywords with number of occurrences for each category of keywords
        """
        # Lowercase the text once instead of once per keyword (matching is
        # case-insensitive, as in the original).
        lowered = text.lower()
        positive_keyword_counts = {k: lowered.count(k.lower())
                                   for k in KeywordFinder.positive_keywords}
        neutral_keyword_counts = {k: lowered.count(k.lower())
                                  for k in KeywordFinder.neutral_keywords}
        negative_keyword_counts = {k: lowered.count(k.lower())
                                   for k in KeywordFinder.negative_keywords}
        return positive_keyword_counts, neutral_keyword_counts, negative_keyword_counts
class UtilityFunctions:
Class for useful analyse functions.
def calc_prec_recall(true_positive: int, false_positive: int, all_positives: int):
Calculates recall, precision, the true negative rate and accuracy from input values.
:param true_positive: number of true positives
:param false_positive: number of false positives
:param all_positives: number of all positives
:return: recall, precision, the true negative rate, accuracy
tp = true_positive
fp = false_positive
tn = 0
fn = all_positives - true_positive
recall = tp / (tp + fn)
precision = tp / (tp + fp)
true_negative_rate = tn/(tn + fp)
accuracy = (tp + tn)/(tp + tn + fp + fn)
return recall, precision, true_negative_rate, accuracy
\ No newline at end of file
from bs4 import BeautifulSoup
from bs4.element import Comment
class PageParser:
    """Class for parsing pages of websites."""

    def __init__(self, body):
        """
        Initializes the parser with the html body of a page.

        :param body: html body to parse
        """
        self.soup = BeautifulSoup(body, 'html.parser')

    def tag_visible(element):
        """
        Checks if an element is visible for visitors of the website.

        Call as ``PageParser.tag_visible(element)`` (no ``self`` parameter).

        :param element: the element which should be checked for visibility
        :return: True if visible, False otherwise
        """
        # NOTE(review): the subject of this membership test was lost when this
        # view was extracted; restored to the conventional `element.parent.name`
        # (standard BeautifulSoup visible-text recipe) — confirm against VCS.
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(self):
        """
        Filters out the complete visible text of a page.

        :return: the filtered text from the page
        """
        texts = self.soup.findAll(text=True)
        visible_texts = filter(PageParser.tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts)

    def image_count(self):
        """
        Counts the images of a page.

        :return: the number of images on a page.
        """
        return len(self.soup.find_all('img'))
\ No newline at end of file
from typing import List
from import ConferencePerson
from import ConferenceSinglePage
class Conference:
    """Representative class of a conference for calculation purposes."""

    def __init__(self,
                 name: str,
                 url: str,
                 speaker_urls: List[str],
                 field: str,
                 sub_fields: List[str],
                 date: str,
                 speakers: List[ConferencePerson]):
        """
        Initializes the class.

        :param name: the name of the conference
        :param url: the url of the conference website
        :param speaker_urls: the urls of the pages from the website, that has the speakers listed
        :param field: the general field of research of this conference
        :param sub_fields: the sub fields this conference covers
        :param date: the date of the conference
        :param speakers: the speakers of the conference
        """
        self.name: str = name
        self.url: str = url
        self.speaker_urls: List[str] = speaker_urls
        self.field: str = field
        self.sub_fields: List[str] = sub_fields
        self.date: str = date
        self.speakers: List[ConferencePerson] = speakers
        # Sub pages are attached later by the website parser.
        self.sub_pages: List[ConferenceSinglePage] = []

    def get_conference_persons(self) -> List[ConferencePerson]:
        """
        Creates a list of every person found on this conference's website with some statistics.

        :return: a list of found persons for this conference
        """
        # NOTE(review): several identifiers and statements of this method were
        # lost when this view was extracted; the `s.name`/`cp.name` accesses,
        # the ConferencePerson(...) keyword arguments and the two append calls
        # below are reconstructions — confirm against VCS.
        conf_p = []
        for sp in self.sub_pages:
            conf_sp_p = []
            for n, o in sp.name_list:
                is_speaker = None
                # The speaker label is only meaningful when the known speaker
                # list actually carries names.
                speakers_set = True
                for s in self.speakers:
                    if s.name == "":
                        speakers_set = False
                if speakers_set:
                    is_speaker = False
                    for s in self.speakers:
                        if s.name == n:
                            is_speaker = True
                # TODO calculate more statistics about the person from the conference page
                new_conf_p = ConferencePerson(name=n,
                                              gender="",
                                              conference_name=self.name,
                                              name_occurrence=o,
                                              is_speaker=is_speaker)
                conf_sp_p.append(new_conf_p)
            # check for duplicates and update statistics of the persons
            for cspp in conf_sp_p:
                is_new_conf_p = True
                for cp in conf_p:
                    if cspp.name == cp.name:
                        is_new_conf_p = False
                        # TODO modify/calculate more statistics about the person from the conference page
                if is_new_conf_p:
                    conf_p.append(cspp)
        return conf_p
class ConferencePerson:
    """A class for a person associated with a conference."""

    def __init__(self,
                 name: str,
                 gender: str,
                 conference_name: str,
                 name_occurrence: int = 0,
                 word_count_occurrence: int = 0,
                 is_speaker: bool = None):
        """
        Initializes the class with some optional variables.

        :param name: the name of the person
        :param gender: the gender of the person
        :param conference_name: the name of the associated conference
        :param name_occurrence: the number of occurrences of this person's name
        :param word_count_occurrence: the number of words occurring along with this person's name
        :param is_speaker: labels this person as a speaker at the conference (or not), None for unknown
        """
        self.name: str = name
        self.gender: str = gender
        self.conference_name: str = conference_name
        # features
        self.name_occurrence: int = name_occurrence
        self.word_count_occurrence: int = word_count_occurrence
        # label
        self.is_speaker: bool = is_speaker
\ No newline at end of file
from typing import List, Dict, Tuple
class ConferenceSinglePage:
    """
    A Class that represents a sub page of the conference website.
    It is used for storing and creating statistics about the page.
    """

    def __init__(self,
                 conference,
                 url: str,
                 speaker_page: bool,
                 name_list: List[Tuple[str, int]],
                 image_count: int,
                 word_count: int,
                 organization_count: int,
                 positive_keyword_count: Dict[str, int],
                 neutral_keyword_count: Dict[str, int],
                 negative_keyword_count: Dict[str, int]):
        """
        Initialises this class.

        :param conference: the associated conference
        :param url: the url of this page
        :param speaker_page: states if this page is a page containing speaker
        :param name_list: a list of person names, found on this page
        :param image_count: the count of images on this page
        :param word_count: the count of words shown for a user on this page
        :param organization_count: the count of organization names appearing on this page
        :param positive_keyword_count: the count of positive keywords found on this page
        :param neutral_keyword_count: the count of neutral keywords found on this page
        :param negative_keyword_count: the count of negative keywords found on this page
        """
        # NOTE(review): `conference` was missing from the visible signature even
        # though both the docstring and the body reference it — restored as the
        # first parameter; confirm against VCS.
        # base info
        self.conference = conference
        self.url: str = url
        self.speaker_page: bool = speaker_page
        self.name_list: List[Tuple[str, int]] = name_list
        # additional info
        self.image_count: int = image_count
        self.word_count: int = word_count
        self.organization_count: int = organization_count
        self.positive_keyword_count: Dict[str, int] = positive_keyword_count
        self.neutral_keyword_count: Dict[str, int] = neutral_keyword_count
        self.negative_keyword_count: Dict[str, int] = negative_keyword_count
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
from analysis.utility import UtilityFunctions
from parser.conference_data_parser import ConferenceDataParser
from parser.website_page_data_parser import ConferenceWebsitePageParser
# NOTE(review): this top-level script lost its indentation and several whole
# statements (the loop bodies that populate the two lists below, and the
# trailing arguments of the three truncated calls at the end) when this view
# was extracted. It is kept verbatim; restore the missing pieces from VCS.
# parse the data files
conference_parser = ConferenceDataParser()
website_parser = ConferenceWebsitePageParser(conference_parser.conferences)
# get all the speaker and person lists
all_known_speaker = []
all_found_persons = []
for c in conference_parser.conferences:
for s in c.speakers:
# NOTE(review): attribute lost in extraction — presumably `s.name != ""`; confirm.
if != "":
# NOTE(review): list-comprehension expressions below lost their leading
# attribute access — presumably `person.name`; confirm.
found_speaker_list = [ for person in all_found_persons if person.is_speaker]
non_speaker_list = [ for person in all_found_persons if
not person.is_speaker and person.is_speaker is not None]
# shows stats of the found speakers/persons
actual_speaker = len(all_known_speaker)
found_speaker = len(found_speaker_list)
found_non_speaker = len(non_speaker_list)
# NOTE(review): remaining format arguments of the two print calls and the
# arguments of calc_prec_recall were truncated in this view.
print('Actual Speaker: {}, Found Speaker: {}, Non Speaker found: {}'.format(actual_speaker,
recall, precision, true_negative_rate, accuracy = UtilityFunctions.calc_prec_recall(
print('Recall: {}, Precision: {}, True negative rate: {}, Accuracy: {}'.format(recall,
import json
from typing import List
from import Conference
from import ConferencePerson
# NOTE(review): docstring quote marks and all indentation of this class were
# stripped by the extraction of this view; the lines are kept verbatim.
class ConferenceDataParser:
A class for parsing a file into a list of conferences.
def __init__(self):
# Parsed conferences; filled by parse().
self.conferences: List[Conference] = []
def parse(self, file: str):
Loads a json file and creates a list of conferences from it.
:param file: the file to read from
self.conferences = []
with open(file) as f:
data = json.load(f)
for index, conference in enumerate(data):
newConference = Conference(
# NOTE(review): the keyword arguments of Conference(...) and the opening of the
# speakers list comprehension were lost in extraction; restore from VCS.
) for p in conference['speakers']]
import json
from typing import List, Tuple
from analysis.text import NameFinder, KeywordFinder
from import PageParser
from import Conference
from import ConferencePerson
from import ConferenceSinglePage
class ConferenceWebsitePageParser:
A class for parsing a file into a list of website pages.