Commit 63ec519c authored by Julian Rogawski

Added project for automatically extracting speakers from conference data

parent 6265edc3
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6 (julian-0HhYIt6F)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"
[packages]
spacy = "*"
"beautifulsoup4" = "*"
[dev-packages]
[requires]
python_version = "3.6"
{
"_meta": {
"hash": {
"sha256": "eb4da93af39ac3bb0d5339ad958e2ebb395fb138014ed668f6f7d2114393956e"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.6"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.python.org/simple",
"verify_ssl": true
}
]
},
"default": {
"beautifulsoup4": {
"hashes": [
"sha256:11a9a27b7d3bddc6d86f59fb76afb70e921a25ac2d6cc55b40d072bd68435a76",
"sha256:7015e76bf32f1f574636c4288399a6de66ce08fb7b2457f628a8d70c0fbabb11",
"sha256:808b6ac932dccb0a4126558f7dfdcf41710dd44a4ef497a0bb59a77f9f078e89"
],
"index": "pypi",
"version": "==4.6.0"
},
"cymem": {
"hashes": [
"sha256:00bb3645dfb9a020d735ba3d6f822b04656388180588d8b2cebde967ee678bcc",
"sha256:0dd61d05977839a922c0d797c355b98949210575918b1743b41e38ae9fb2c3a7",
"sha256:4bc1056b52d959fcbb1e0f32ec84fa131754d6be1e36b65782c6ac86419f4bf3",
"sha256:4c5d9ca6ec706792b8d9b1faf6db77b95545c388c768b21d940f197aa7efbb7e",
"sha256:50292f4dd0d950a8698bae27d71efe59da7ff08e591b735e08b658aae42c4745",
"sha256:616d06333f46dd03c128d97912d361183fc02249e6420a7b7907b41214c51562",
"sha256:944af97d4d34a2470b5199f1c31d2dfc79cdec7bd7a41354d839a8ab87fdfaa6",
"sha256:b38056efb99078b06c504adb5f03a8d9e822a5543451737b746028a71c4b1ac3",
"sha256:b6513b2926c60d641f159e79e6fb16460dfb50ebcce31a5af0370c51837c7efc",
"sha256:daa6003fcc199752ab703142021cff74774872a932303b240dc0ea177adf295d",
"sha256:f06d9b50da0474d7405674d8101c319d89a17d33792d6d429fe3d5c64f0d9df1"
],
"version": "==1.31.2"
},
"cytoolz": {
"hashes": [
"sha256:476a2ad176de5eaef80499b7b43d4f72ba6d23df33d349088dae315e9b31c552"
],
"version": "==0.8.2"
},
"dill": {
"hashes": [
"sha256:624dc244b94371bb2d6e7f40084228a2edfff02373fe20e018bef1ee92fdd5b3"
],
"version": "==0.2.8.2"
},
"msgpack-numpy": {
"hashes": [
"sha256:6947df61826a2917e38dbe07957a0c70dc82dce93ec38153dae850fdd21a4583",
"sha256:afc603c7cf8497fb125a8c8c713518a004e9662101f088e3d4fcf7688b08eeb3"
],
"version": "==0.4.1"
},
"msgpack-python": {
"hashes": [
"sha256:378cc8a6d3545b532dfd149da715abae4fda2a3adb6d74e525d0d5e51f46909b"
],
"version": "==0.5.6"
},
"murmurhash": {
"hashes": [
"sha256:651137ed3e1169342c9edade454f3beb7fcdf28d4ad1ac232725237eaf442d9a"
],
"version": "==0.28.0"
},
"numpy": {
"hashes": [
"sha256:07379fe0b450f6fd6e5934a9bc015025bb4ce1c8fbed3ca8bef29328b1bc9570",
"sha256:085afac75bbc97a096744fcfc97a4b321c5a87220286811e85089ae04885acdd",
"sha256:2d6481c6bdab1c75affc0fc71eb1bd4b3ecef620d06f2f60c3f00521d54be04f",
"sha256:2df854df882d322d5c23087a4959e145b953dfff2abe1774fec4f639ac2f3160",
"sha256:381ad13c30cd1d0b2f3da8a0c1a4aa697487e8bb0e9e0cbeb7439776bcb645f8",
"sha256:385f1ce46e08676505b692bfde918c1e0b350963a15ef52d77691c2cf0f5dbf6",
"sha256:4d278c2261be6423c5e63d8f0ceb1b0c6db3ff83f2906f4b860db6ae99ca1bb5",
"sha256:51c5dcb51cf88b34b7d04c15f600b07c6ccbb73a089a38af2ab83c02862318da",
"sha256:589336ba5199c8061239cf446ee2f2f1fcc0c68e8531ee1382b6fc0c66b2d388",
"sha256:5edf1acc827ed139086af95ce4449b7b664f57a8c29eb755411a634be280d9f2",
"sha256:6b82b81c6b3b70ed40bc6d0b71222ebfcd6b6c04a6e7945a936e514b9113d5a3",
"sha256:6c57f973218b776195d0356e556ec932698f3a563e2f640cfca7020086383f50",
"sha256:758d1091a501fd2d75034e55e7e98bfd1370dc089160845c242db1c760d944d9",
"sha256:8622db292b766719810e0cb0f62ef6141e15fe32b04e4eb2959888319e59336b",
"sha256:8b8dcfcd630f1981f0f1e3846fae883376762a0c1b472baa35b145b911683b7b",
"sha256:97fa8f1dceffab782069b291e38c4c2227f255cdac5f1e3346666931df87373e",
"sha256:9d69967673ab7b028c2df09cae05ba56bf4e39e3cb04ebe452b6035c3b49848e",
"sha256:9e1f53afae865cc32459ad211493cf9e2a3651a7295b7a38654ef3d123808996",
"sha256:a4a433b3a264dbc9aa9c7c241e87c0358a503ea6394f8737df1683c7c9a102ac",
"sha256:baadc5f770917ada556afb7651a68176559f4dca5f4b2d0947cd15b9fb84fb51",
"sha256:c725d11990a9243e6ceffe0ab25a07c46c1cc2c5dc55e305717b5afe856c9608",
"sha256:d696a8c87315a83983fc59dd27efe034292b9e8ad667aeae51a68b4be14690d9",
"sha256:e1864a4e9f93ddb2dc6b62ccc2ec1f8250ff4ac0d3d7a15c8985dd4e1fbd6418"
],
"version": "==1.14.5"
},
"pathlib": {
"hashes": [
"sha256:6940718dfc3eff4258203ad5021090933e5c04707d5ca8cc9e73c94a7894ea9f"
],
"version": "==1.0.1"
},
"plac": {
"hashes": [
"sha256:854693ad90367e8267112ffbb8955f57d6fdeac3191791dc9ffce80f87fd2370",
"sha256:ba3f719a018175f0a15a6b04e6cc79c25fd563d348aacd320c3644d2a9baf89b"
],
"version": "==0.9.6"
},
"preshed": {
"hashes": [
"sha256:a6b3a9e34634600e3e410ec25e0debed4b65a47eb37514a063d189d1c425b4dd"
],
"version": "==1.0.0"
},
"regex": {
"hashes": [
"sha256:19c4b0f68dd97b7116e590f47d60d97ab9e76966acc321b1d20dd87c2b64dff2",
"sha256:1af6b820bec5ca82af87447af5a6dcc23b3ddc96b0184fd71666be0c24fb2a4f",
"sha256:232dbc28a2562d92d713c3c1eb2b9276f3ebcbdb6d3e96ff68d0417a71926784",
"sha256:3d26ce7e605a501509b68c343fc9d9e09f76c2e9e261df8183027bdc750c97ce",
"sha256:52b590a41b9677314d02d9055edc33992db758b3d5167aa1365229a6a0c26a6d",
"sha256:565f9aac9cd43b2351f7fcbc0d6056f8aebf4f6d049a17982085019ab9acdf28",
"sha256:656984899644d3fe2e40533724f513a21127f77162a15dd5244af3c965152c63",
"sha256:689c9d17c3ba02f52e8481a5c584c8c11ba27d6cc5f939efdd838ae0d0d1af41",
"sha256:8a9d9db8ef1621ae51ea12acb5e503204b4586e05c6cfd418aecb9466a71bd87",
"sha256:ad2beea450d551b11b47512ce920127d7c8645e528cc56dc9502c5973e8732f3",
"sha256:b39867f577bc59b2fec9209facc513c761978e4ac63f4b73b9750a2c1501729e",
"sha256:b6a7725a069be8f9dd09e1e500e5b57556b301942e21c8c712627f73ec048286",
"sha256:b9e9b97696e75e826adac1920b13e7bac3a6a2128c085783abd208d73a278d70",
"sha256:bf4896ed1ca2017153fc6b341bc8a0da8ca5480f85eebd7bfe58bbafceb4e728",
"sha256:c3c2fe1e0d90f4c93be5b588480f05defd44f64c65767a657de69c4db4429a39",
"sha256:d811874ed669165fe1059a54f860db5c6ab5f48100bf4945d915fd2f877b2531",
"sha256:db616380b04e29e5709bc3ec0674e827dfed3d18e7d686c09537ab01506127c9",
"sha256:efa66273b49dbd7a9f6a4d02d1a7d5bf353d568a89f7cd8927812daa9f83bb84",
"sha256:f8feab5b517cdc65a61a50549e7dcfa0f61ab872a0034da1f6b8d61775178b6a"
],
"version": "==2017.4.5"
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
],
"version": "==1.11.0"
},
"spacy": {
"hashes": [
"sha256:cddb06e7965222e4339eb59d2258db8dadab19ef8b0a1a44a2d33f94935ba421"
],
"index": "pypi",
"version": "==2.0.11"
},
"termcolor": {
"hashes": [
"sha256:1d6d69ce66211143803fbc56652b41d73b4a400a2891d7bf7a1cdf4c02de613b"
],
"version": "==1.1.0"
},
"thinc": {
"hashes": [
"sha256:9a1deb850285f76efaf0ae38b605a137a3978826282cc57dcc1e66b779402a76"
],
"version": "==6.10.2"
},
"toolz": {
"hashes": [
"sha256:929f0a7ea7f61c178bd951bdae93920515d3fbdbafc8e6caf82d752b9b3b31c9"
],
"version": "==0.9.0"
},
"tqdm": {
"hashes": [
"sha256:224291ee0d8c52d91b037fd90806f48c79bcd9994d3b0abc9e44b946a908fccd",
"sha256:77b8424d41b31e68f437c6dd9cd567aebc9a860507cb42fbd880a5f822d966fe"
],
"version": "==4.23.4"
},
"ujson": {
"hashes": [
"sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86"
],
"version": "==1.35"
},
"wrapt": {
"hashes": [
"sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6"
],
"version": "==1.10.11"
}
},
"develop": {}
}
import spacy
class NameFinder:
"""
    A text-analysis class for finding person names.
"""
def __init__(self):
        self.nlp = spacy.load('xx_ent_wiki_sm')  # load spaCy's multilingual NER model used to identify names
def get_persons(self, text):
"""
Creates a list of person names from a text.
        :param text: The text in which this function searches for names.
:return: list of found person names.
"""
doc = self.nlp(text)
raw_persons = [entity.text for entity in doc.ents if entity.label_ == 'PER']
        filtered_persons = list(filter(lambda p: len(p.split(' ')) > 1, raw_persons))  # drop single-word matches that are unlikely to be full names
return filtered_persons
def get_organisations(self, text):
"""
Creates a list of organisation names from a text.
        :param text: The text in which this function searches for names.
:return: list of found organisation names.
"""
doc = self.nlp(text)
organisations = [entity.text for entity in doc.ents if entity.label_ == 'ORG']
return organisations
class KeywordFinder:
"""
    A text-analysis class for finding keywords.
"""
positive_keywords = ['Speaker',
'Invite']
neutral_keywords = [ 'Conference',
'Paper',
'University']
negative_keywords = ['Organize',
'Committee']
@staticmethod
def get_keyword_count(text: str):
"""
Counts and returns the number of found keywords.
        :param text: The text in which this function searches for keywords.
:return: dicts of keywords with number of occurrences for each category of keywords
"""
positive_keyword_counts = dict()
neutral_keyword_counts = dict()
negative_keyword_counts = dict()
for k in KeywordFinder.positive_keywords:
positive_keyword_counts[k] = text.lower().count(k.lower())
for k in KeywordFinder.neutral_keywords:
neutral_keyword_counts[k] = text.lower().count(k.lower())
for k in KeywordFinder.negative_keywords:
negative_keyword_counts[k] = text.lower().count(k.lower())
return positive_keyword_counts, neutral_keyword_counts, negative_keyword_counts
class UtilityFunctions:
"""
    Class for useful analysis functions.
"""
@staticmethod
def calc_prec_recall(true_positive: int, false_positive: int, all_positives: int):
"""
Calculates recall, precision, the true negative rate and accuracy from input values.
:param true_positive: number of true positives
:param false_positive: number of false positives
:param all_positives: number of all positives
:return: recall, precision, the true negative rate, accuracy
"""
tp = true_positive
fp = false_positive
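        # true negatives are not tracked here, so tn stays 0: the true negative
        # rate will always be 0 (and raises ZeroDivisionError when fp == 0)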
tn = 0
fn = all_positives - true_positive
recall = tp / (tp + fn)
precision = tp / (tp + fp)
true_negative_rate = tn/(tn + fp)
accuracy = (tp + tn)/(tp + tn + fp + fn)
return recall, precision, true_negative_rate, accuracy
\ No newline at end of file
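A minimal usage sketch for the helpers above, assuming the xx_ent_wiki_sm model has been downloaded (for spaCy 2.x: python -m spacy download xx_ent_wiki_sm); the sample text and the printed results are only illustrative.

from analysis.text import NameFinder, KeywordFinder

sample_text = "We invite you to ExampleConf 2018. Speaker: Jane Doe, University of Examples."

# named entities found by the multilingual model
finder = NameFinder()
print(finder.get_persons(sample_text))        # e.g. ['Jane Doe']
print(finder.get_organisations(sample_text))  # e.g. ['University of Examples']

# keyword counts per category (case-insensitive substring counts)
positive, neutral, negative = KeywordFinder.get_keyword_count(sample_text)
print(positive)  # {'Speaker': 1, 'Invite': 1}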
from bs4 import BeautifulSoup
from bs4.element import Comment
class PageParser:
"""
Class for parsing pages of websites.
"""
def __init__(self, body):
"""
Initializes the parser with the html body of a page.
:param body: html body to parse
"""
self.soup = BeautifulSoup(body, 'html.parser')
@staticmethod
def tag_visible(element):
"""
Checks if an element is visible for visitors of the website.
:param element: the element which should be checked for visibility
:return: True if visible, False otherwise
"""
if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
return False
if isinstance(element, Comment):
return False
return True
def text_from_html(self):
"""
Filters out the complete visible text of a page.
:return: the filtered text from the page
"""
        texts = self.soup.find_all(string=True)
visible_texts = filter(PageParser.tag_visible, texts)
return u" ".join(t.strip() for t in visible_texts)
def image_count(self):
"""
Counts the images of a page.
:return: the number of images on a page.
"""
return len(self.soup.find_all('img'))
\ No newline at end of file
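A small sketch of how PageParser could be applied to a fetched page; the HTML snippet is invented for illustration.

from analysis.website import PageParser

html = ('<html><head><title>ExampleConf</title></head>'
        '<body><p>Keynote by Jane Doe</p><img src="logo.png"/></body></html>')

page = PageParser(html)
print(page.text_from_html())  # visible text only, e.g. 'Keynote by Jane Doe' (title/head text is filtered out)
print(page.image_count())     # 1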
from typing import List
from components.website.conference_person import ConferencePerson
from components.website.conference_single_page import ConferenceSinglePage
class Conference:
"""
Representative class of a conference for calculation purposes.
"""
def __init__(self,
name: str,
url: str,
speaker_urls: List[str],
field: str,
sub_fields: List[str],
date: str,
speakers: List[ConferencePerson]):
"""
Initializes the class.
:param name: the name of the conference
:param url: the url of the conference website
        :param speaker_urls: the URLs of the pages on the website that list the speakers
:param field: the general field of research of this conference
:param sub_fields: the sub fields this conference covers
:param date: the date of the conference
:param speakers: the speakers of the conference
"""
self.name: str = name
self.url: str = url
self.speaker_urls: List[str] = speaker_urls
self.field: str = field
self.sub_fields: List[str] = sub_fields
self.date: str = date
self.speakers: List[ConferencePerson] = speakers
self.sub_pages: List[ConferenceSinglePage] = []
def get_conference_persons(self) -> List[ConferencePerson]:
"""
        Creates a list of every person found on this conference's website, together with some statistics.
:return: a list of found persons for this conference
"""
conf_p = []
for sp in self.sub_pages:
conf_sp_p = []
for n, o in sp.name_list:
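                # the speaker label stays None (unknown) unless every known speaker of
                # this conference has a non-empty name; only then is True/False decided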
is_speaker = None
speakers_set = True
for s in self.speakers:
if s.name == "":
speakers_set = False
if speakers_set:
is_speaker = False
for s in self.speakers:
if s.name == n:
is_speaker = True
# TODO calculate more statistics about the person from the conference page
new_conf_p = ConferencePerson(name=n,
gender="",
conference_name=self.name,
is_speaker=is_speaker)
conf_sp_p.append(new_conf_p)
# check for duplicates and update statistics of the persons
for cspp in conf_sp_p:
is_new_conf_p = True
for cp in conf_p:
if cp.name.lower() == cspp.name.lower():
is_new_conf_p = False
# TODO modify/calculate more statistics about the person from the conference page
if is_new_conf_p:
conf_p.append(cspp)
return conf_p
class ConferencePerson:
"""
A class for a person associated with a conference.
"""
def __init__(self,
name: str,
gender: str,
conference_name: str,
name_occurrence: int = 0,
word_count_occurrence: int = 0,
is_speaker: bool = None):
"""
Initializes the class with some optional variables.
:param name: the name of the person
:param gender: the gender of the person
:param conference_name: the name of the associated conference
:param name_occurrence: the number of occurrences of this person's name
:param word_count_occurrence: the number of words occurring along with this person's name
:param is_speaker: labels this person as a speaker at the conference (or not), None for unknown
"""
self.name: str = name
self.gender: str = gender
self.conference_name: str = conference_name
# features
self.name_occurrence: int = name_occurrence
self.word_count_occurrence: int = word_count_occurrence
# label
self.is_speaker: bool = is_speaker
\ No newline at end of file
from typing import List, Dict, Tuple
class ConferenceSinglePage:
"""
    A class that represents a subpage of the conference website.
It is used for storing and creating statistics about the page.
"""
def __init__(self,
conference,
url: str,
speaker_page: bool,
name_list: List[Tuple[str, int]],
image_count: int,
word_count: int,
organization_count: int,
positive_keyword_count: Dict[str, int],
neutral_keyword_count: Dict[str, int],
negative_keyword_count: Dict[str, int]):
"""
Initialises this class.
:param conference: the associated conference
:param url: the url of this page
        :param speaker_page: states whether this page lists speakers
:param name_list: a list of person names, found on this page
:param image_count: the count of images on this page
:param word_count: the count of words shown for a user on this page
:param organization_count: the count of organization names appearing on this page
:param positive_keyword_count: the count of positive keywords found on this page
:param neutral_keyword_count: the count of neutral keywords found on this page
:param negative_keyword_count: the count of negative keywords found on this page
"""
# base info
self.conference = conference
self.url: str = url
self.speaker_page: bool = speaker_page
self.name_list: List[Tuple[str, int]] = name_list
# additional info
self.image_count: int = image_count
self.word_count: int = word_count
self.organization_count: int = organization_count
self.positive_keyword_count: Dict[str, int] = positive_keyword_count
self.neutral_keyword_count: Dict[str, int] = neutral_keyword_count
self.negative_keyword_count: Dict[str, int] = negative_keyword_count
\ No newline at end of file
from analysis.utility import UtilityFunctions
from parser.conference_data_parser import ConferenceDataParser
from parser.website_page_data_parser import ConferenceWebsitePageParser
# parse the data files
conference_parser = ConferenceDataParser()
conference_parser.parse("data/conferences.json")
website_parser = ConferenceWebsitePageParser(conference_parser.conferences)
website_parser.parse("data/pages.json")
# get all the speaker and person lists
all_known_speaker = []
all_found_persons = []
for c in conference_parser.conferences:
for s in c.speakers:
if s.name != "":
all_known_speaker.append(s)
all_found_persons.extend(c.get_conference_persons())
found_speaker_list = [person.name for person in all_found_persons if person.is_speaker]
non_speaker_list = [person.name for person in all_found_persons if
not person.is_speaker and person.is_speaker is not None]
# shows stats of the found speakers/persons
actual_speaker = len(all_known_speaker)
found_speaker = len(found_speaker_list)
found_non_speaker = len(non_speaker_list)
print('Actual speakers: {}, Found speakers: {}, Non-speakers found: {}'.format(actual_speaker,
found_speaker,
found_non_speaker))
recall, precision, true_negative_rate, accuracy = UtilityFunctions.calc_prec_recall(
found_speaker,
found_non_speaker,
actual_speaker
)
print('Recall: {}, Precision: {}, True negative rate: {}, Accuracy: {}'.format(recall,
precision,
true_negative_rate,
accuracy))
import json
from typing import List
from components.website.conference import Conference
from components.website.conference_person import ConferencePerson
class ConferenceDataParser:
"""
A class for parsing a file into a list of conferences.
"""
def __init__(self):
self.conferences: List[Conference] = []
def parse(self, file: str):
"""
Loads a json file and creates a list of conferences from it.
:param file: the file to read from
"""
self.conferences = []
with open(file) as f:
data = json.load(f)
for index, conference in enumerate(data):
                new_conference = Conference(
conference['name'],
conference['website'],
conference['speaker_urls'],
"",
conference['fields'],
conference['date'],
[ConferencePerson(p['name'],
p['gender'],
p['organization']
) for p in conference['speakers']]
)
                self.conferences.append(new_conference)
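For reference, a minimal sketch of the structure that ConferenceDataParser.parse expects to find in data/conferences.json, derived from the keys accessed above; all values are invented.

# after json.load(f), `data` should be a list of entries shaped like this
example_entry = {
    "name": "ExampleConf 2018",
    "website": "https://example.org/conf",
    "speaker_urls": ["https://example.org/conf/speakers"],
    "fields": ["Computer Science"],
    "date": "2018-07-01",
    "speakers": [
        {"name": "Jane Doe", "gender": "f", "organization": "University of Examples"}
    ],
}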
import json
from typing import List, Tuple
from analysis.text import NameFinder, KeywordFinder
from analysis.website import PageParser
from components.website.conference import Conference
from components.website.conference_person import ConferencePerson
from components.website.conference_single_page import ConferenceSinglePage