Commit 8e9a5128 authored by Orkut Karaçalık
Browse files

Merge branch 'master' of gitlab.uni-koblenz.de:okaracalik/h-index-conf-speakers

parents 2793e9a9 96a4502d
name: binder_env
channels:
- defaults
dependencies:
- _tflow_180_select=3.0=eigen
- absl-py=0.2.2=py36_0
- asn1crypto=0.24.0=py36_0
- astor=0.6.2=py36_1
- atomicwrites=1.1.5=py36_0
- attrs=18.1.0=py36_0
- automat=0.7.0=py36_0
- backcall=0.1.0=py36_0
- beautifulsoup4=4.6.0=py36_1
- blas=1.0=mkl
- bleach=1.5.0=py36_0
- ca-certificates=2018.03.07=0
- certifi=2018.4.16=py36_0
- cffi=1.11.5=py36h9745a5d_0
- chardet=3.0.4=py36_1
- constantly=15.1.0=py36h28b3542_0
- cryptography=2.2.2=py36h14c3975_0
- cssselect=1.0.3=py36_0
- cycler=0.10.0=py36_0
- cymem=1.31.2=py36h6bb024c_0
- cytoolz=0.9.0.1=py36h14c3975_1
- dbus=1.13.2=h714fa37_1
- decorator=4.3.0=py36_0
- dill=0.2.8.2=py36_0
- expat=2.2.5=he0dffb1_0
- fontconfig=2.13.0=h9420a91_0
- freetype=2.9.1=h8a8886c_0
- gast=0.2.0=py36_0
- glib=2.56.1=h000015b_0
- grpcio=1.12.1=py36hdbcaa40_0
- gst-plugins-base=1.14.0=hbbd80ab_1
- gstreamer=1.14.0=hb453b48_1
- html5lib=0.9999999=py36_0
- hyperlink=18.0.0=py36_0
- icu=58.2=h9c2bf20_1
- idna=2.7=py36_0
- incremental=17.5.0=py36_0
- intel-openmp=2018.0.3=0
- ipykernel=4.8.2=py36_0
- ipython=6.4.0=py36_1
- ipython_genutils=0.2.0=py36_0
- jedi=0.12.1=py36_0
- jpeg=9b=h024ee3a_2
- jupyter_client=5.2.3=py36_0
- jupyter_core=4.4.0=py36_0
- kiwisolver=1.0.1=py36hf484d3e_0
- libedit=3.1.20170329=h6b74fdf_2
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=7.2.0=hdf63c60_3
- libgfortran-ng=7.2.0=hdf63c60_3
- libpng=1.6.34=hb9fc6fc_0
- libprotobuf=3.5.2=h6f1eeef_0
- libsodium=1.0.16=h1bed415_0
- libstdcxx-ng=7.2.0=hdf63c60_3
- libuuid=1.0.3=h1bed415_2
- libxcb=1.13=h1bed415_1
- libxml2=2.9.8=h26e45fe_1
- libxslt=1.1.32=h1312cb7_0
- lxml=4.2.2=py36hf71bdeb_0
- markdown=2.6.11=py36_0
- matplotlib=2.2.2=py36hb69df0a_2
- mkl=2018.0.3=1
- mkl_fft=1.0.2=py36h651fb7a_0
- mkl_random=1.0.1=py36h4414c95_1
- more-itertools=4.2.0=py36_0
- msgpack-numpy=0.4.3=py36_0
- msgpack-python=0.5.6=py36h6bb024c_0
- murmurhash=0.28.0=py36hf484d3e_0
- ncurses=6.1=hf484d3e_0
- numpy=1.14.5=py36h1b885b7_4
- numpy-base=1.14.5=py36hdbf6ddf_4
- openssl=1.0.2o=h20670df_0
- pandas=0.23.3=py36h04863e7_0
- parsel=1.4.0=py36_0
- parso=0.3.1=py36_0
- patsy=0.5.0=py36_0
- pcre=8.42=h439df22_0
- pexpect=4.6.0=py36_0
- pickleshare=0.7.4=py36_0
- pip=10.0.1=py36_0
- plac=0.9.6=py36_0
- pluggy=0.6.0=py36_0
- preshed=1.0.0=py36hf484d3e_0
- prompt_toolkit=1.0.15=py36_0
- protobuf=3.5.2=py36hf484d3e_1
- ptyprocess=0.6.0=py36_0
- py=1.5.4=py36_0
- pyasn1=0.4.3=py36_0
- pyasn1-modules=0.2.2=py36_0
- pycparser=2.18=py36_1
- pydispatcher=2.0.5=py36_0
- pygments=2.2.0=py36_0
- pyopenssl=18.0.0=py36_0
- pyparsing=2.2.0=py36_1
- pyqt=5.9.2=py36h22d08a2_0
- pysocks=1.6.8=py36_0
- pytest=3.6.3=py36_0
- pytest-runner=4.2=py36_0
- python=3.6.6=hc3d631a_0
- python-dateutil=2.7.3=py36_0
- pytz=2018.5=py36_0
- pyzmq=17.0.0=py36h14c3975_3
- qt=5.9.6=h52aff34_0
- queuelib=1.5.0=py36_0
- readline=7.0=ha6073c6_4
- regex=2018.06.21=py36h14c3975_0
- requests=2.19.1=py36_0
- scipy=1.1.0=py36hc49cb51_0
- scrapy=1.5.0=py36_0
- seaborn=0.8.1=py36_0
- service_identity=17.0.0=py36h28b3542_0
- setuptools=39.2.0=py36_0
- simplegeneric=0.8.1=py36_2
- sip=4.19.8=py36hf484d3e_0
- six=1.11.0=py36_1
- spacy=2.0.11=py36h04863e7_2
- sqlite=3.24.0=h84994c4_0
- statsmodels=0.9.0=py36h035aef0_0
- tensorboard=1.8.0=py36hf484d3e_0
- tensorflow=1.8.0=h57681fa_0
- tensorflow-base=1.8.0=py36h5f64886_0
- termcolor=1.1.0=py36_1
- thinc=6.11.2=py36hedc7406_1
- tk=8.6.7=hc745277_3
- toolz=0.9.0=py36_0
- tornado=5.0.2=py36h14c3975_0
- tqdm=4.23.4=py36_0
- traitlets=4.3.2=py36_0
- twisted=17.5.0=py36_0
- ujson=1.35=py36h14c3975_0
- urllib3=1.23=py36_0
- w3lib=1.19.0=py36_0
- wcwidth=0.1.7=py36_0
- werkzeug=0.14.1=py36_0
- wheel=0.31.1=py36_0
- wrapt=1.10.11=py36h14c3975_2
- xz=5.2.4=h14c3975_4
- zeromq=4.2.5=hf484d3e_0
- zlib=1.2.11=ha838bed_2
- zope=1.0=py36_0
- zope.interface=4.5.0=py36h14c3975_0
- pip:
- gender-ai==0.1
# NOTE(review): machine-specific path committed to the repo — presumably
# ignored when recreating the env elsewhere, but consider removing it; verify.
prefix: /home/julian/miniconda3/envs/binder_env
# Fetch spaCy's multi-language ("xx") model so it is available at runtime
# (spacy is pinned in the conda environment above).
python -m spacy download xx
from typing import List
from bs4 import BeautifulSoup
from bs4.element import Comment
......@@ -33,7 +35,101 @@ class PageParser:
"""
texts = self.soup.findAll(text=True)
visible_texts = filter(PageParser.tag_visible, texts)
return u" ".join(t.strip() for t in visible_texts)
links = self.soup.findAll('a')
images = self.soup.find_all('img')
headers = self.soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
text = ""
for link in links:
if link.string:
text += " " + link.string.strip()
for img in images:
text += " " + img.get('alt', '').strip()
for h in headers:
text += " " + h.text.strip()
return text + u" ".join(t.strip() for t in visible_texts)
def name_stats(self, names: List[str]):
    """Count how often each name occurs in links, image alts, and headers.

    For every name two ``[link, image, header]`` occurrence counters are
    kept: one over the whole page and one restricted to ``<div>`` sections
    whose own (and parent's) headers mention "speaker" — a heuristic for
    speaker-list sections. Matching is case-insensitive after stripping
    surrounding whitespace.

    :param names: person names to look for
    :return: tuple ``(name_stats, speaker_name_stats)``; each is a dict
             mapping name -> ``[link_count, image_count, header_count]``
    """
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    def _tally(row, target, links, images, headers):
        # row is the [link, image, header] counter list for one name;
        # target is the lower-cased name to match.
        for link in links:
            # BUGFIX: was `link.srting` (typo) — on a bs4 Tag that silently
            # yields no match, so link occurrences were never counted.
            if link.string and target == link.string.strip().lower():
                row[0] += 1
        for image in images:
            alt = image.get('alt', '')
            if alt and target == alt.strip().lower():
                row[1] += 1
        for header in headers:
            if header.text and target == header.text.strip().lower():
                row[2] += 1

    def _mentions_speaker(texts):
        # BUGFIX: the original used `text.find("speaker")`, but str.find
        # returns -1 (truthy) when the substring is absent and 0 (falsy)
        # when it is at position 0 — inverting the check. Use substring
        # containment instead (lower-casing also covers "Speaker").
        return any("speaker" in text.lower() for text in texts)

    name_stats = {name: [0, 0, 0] for name in names}
    speaker_name_stats = {name: [0, 0, 0] for name in names}

    # Page-wide counts.
    page_links = self.soup.findAll('a')
    page_images = self.soup.find_all('img')
    page_headers = self.soup.find_all(header_tags)
    for name in names:
        _tally(name_stats[name], name.lower(),
               page_links, page_images, page_headers)

    # Counts restricted to divs that look like speaker sections: a header
    # in the div's parent AND a direct-child header of the div itself must
    # mention "speaker".
    for div in self.soup.find_all("div"):
        if not div.parent:
            continue
        if not _mentions_speaker(
                h.text for h in div.parent.find_all(header_tags)):
            continue
        if not _mentions_speaker(
                h.text for h in div.find_all(header_tags, recursive=False)):
            continue
        div_links = div.findAll('a')
        div_images = div.find_all('img')
        div_headers = div.find_all(header_tags)
        for name in names:
            _tally(speaker_name_stats[name], name.lower(),
                   div_links, div_images, div_headers)

    return name_stats, speaker_name_stats
def image_count(self):
"""
......
......@@ -45,7 +45,7 @@ class Conference:
conf_p = []
for sp in self.sub_pages:
conf_sp_p = []
for n, o in sp.name_list:
for n in sp.name_list:
is_speaker = None
speakers_set = True
for s in self.speakers:
......@@ -65,7 +65,13 @@ class Conference:
new_conf_p = ConferencePerson(name=n,
gender=gender,
conference_name=self.name,
name_occurrence=o,
name_occurrence=sp.name_occurrence[n],
name_link_occurrence=sp.name_stats[n][0],
name_image_occurrence=sp.name_stats[n][1],
name_header_occurrence=sp.name_stats[n][2],
speaker_name_link_occurrence=sp.speaker_name_stats[n][0],
speaker_name_image_occurrence=sp.speaker_name_stats[n][1],
speaker_name_header_occurrence=sp.speaker_name_stats[n][2],
is_speaker=is_speaker)
conf_sp_p.append(new_conf_p)
except:
......
......@@ -8,6 +8,12 @@ class ConferencePerson:
conference_name: str,
name_occurrence: int = 0,
word_count_occurrence: int = 0,
name_link_occurrence: int = 0,
name_image_occurrence: int = 0,
name_header_occurrence: int = 0,
speaker_name_link_occurrence: int = 0,
speaker_name_image_occurrence: int = 0,
speaker_name_header_occurrence: int = 0,
is_speaker: bool = None):
"""
Initializes the class with some optional variables.
......@@ -25,6 +31,12 @@ class ConferencePerson:
# features
self.name_occurrence: int = name_occurrence
self.word_count_occurrence: int = word_count_occurrence
self.name_link_occurrence: int = name_link_occurrence
self.name_image_occurrence: int = name_image_occurrence
self.name_header_occurrence: int = name_header_occurrence
self.speaker_name_link_occurrence: int = speaker_name_link_occurrence
self.speaker_name_image_occurrence: int = speaker_name_image_occurrence
self.speaker_name_header_occurrence: int = speaker_name_header_occurrence
# label
self.is_speaker: bool = is_speaker
\ No newline at end of file
......@@ -10,7 +10,10 @@ class ConferenceSinglePage:
conference,
url: str,
speaker_page: bool,
name_list: List[Tuple[str, int]],
name_list: List[str],
name_occurrence: Dict[str,int],
name_stats: Dict[str, List],
speaker_name_stats: Dict[str, List],
image_count: int,
word_count: int,
organization_count: int,
......@@ -34,7 +37,10 @@ class ConferenceSinglePage:
self.conference = conference
self.url: str = url
self.speaker_page: bool = speaker_page
self.name_list: List[Tuple[str, int]] = name_list
self.name_list: List[str] = name_list
self.name_occurrence: Dict[str, int] = name_occurrence
self.name_stats: Dict[str, List] = name_stats
self.speaker_name_stats: Dict[str, List] = speaker_name_stats
# additional info
self.image_count: int = image_count
......
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
......@@ -8,41 +8,52 @@ from handler.conference_data_handler import ConferenceDataHandler
from handler.website_page_data_parser import ConferenceWebsitePageParser
# parse the data files
print("Read conference data...")
conferences = ConferenceDataHandler.parse("data/conferences.json")
filter_words = ConferenceDataHandler.get_filter_words(conferences)
filter_words.extend(["biography", "department", "university", "–", "abstract", "keynote", "bio"])
print(filter_words)
print("Read speaker website data...")
website_parser = ConferenceWebsitePageParser(conferences, filter_words)
website_parser.parse("data/pages.json")
# get all the speaker and person lists
all_found_persons = ConferenceDataHandler.find_persons(conferences)
speaker_list = ConferenceDataHandler.get_speaker(conferences)
found_speaker_list = [person.name for person in all_found_persons if person.is_speaker]
not_found_speaker_list = [s for s in speaker_list if s not in found_speaker_list]
non_speaker_list = [person.name for person in all_found_persons if
not person.is_speaker and person.is_speaker is not None]
found_speaker_list = []
not_found_speaker_list = []
non_speaker_list = []
for found_persons, speaker_list in zip(all_found_persons, speaker_list):
add_found_speaker = [person.name.lower() for person in found_persons if person.is_speaker]
found_speaker_list.extend(add_found_speaker)
non_speaker_list.extend([person.name.lower() for person in found_persons if
not person.is_speaker and person.is_speaker is not None])
not_found_speaker_list.extend(
[speaker.lower() for speaker in speaker_list if speaker.lower() not in add_found_speaker])
print(speaker_list)
print(not_found_speaker_list)
print(found_speaker_list)
print(non_speaker_list)
# ....
train_person_list = [person for person in all_found_persons if person.is_speaker is not None]
f = open("data/speaker_train_data.json", "w")
# create train data
train_person_list = []
for person_list in all_found_persons:
train_person_list.extend([person for person in person_list if person.is_speaker is not None])
f = open("data/01_speaker_train_data.json", "w")
s = json.dumps([person.__dict__ for person in train_person_list])
f.write("%s\n" % s)
# .......
predict_person_list = [person for person in all_found_persons if person.is_speaker is None]
f = open("data/speaker_predict_data.json", "w")
# create predict data
predict_person_list = []
for person_list in all_found_persons:
predict_person_list.extend([person for person in person_list if person.is_speaker is None])
f = open("data/01_speaker_predict_data.json", "w")
s = json.dumps([person.__dict__ for person in predict_person_list])
f.write("%s\n" % s)
# shows stats of the found speakers/persons
actual_speaker = len(speaker_list)
actual_speaker = len(found_speaker_list) + len(not_found_speaker_list)
found_speaker = len(found_speaker_list)
found_non_speaker = len(non_speaker_list)
print('Actual Speaker: {}, Found Speaker: {}, Non Speaker found: {}'.format(actual_speaker,
......
......@@ -36,7 +36,7 @@ class ConferenceDataHandler:
return conferences
@staticmethod
def get_speaker(conferences: List[Conference]) -> List[str]:
def get_speaker(conferences: List[Conference]) -> List[List[str]]:
"""
Search every speaker of the conference.
:param conferences: the conferences to search from
......@@ -44,13 +44,15 @@ class ConferenceDataHandler:
"""
speakers = []
for c in conferences:
add_speaker = []
for s in c.speakers:
if s.name != "":
speakers.append(s.name)
add_speaker.append(s.name)
speakers.append(add_speaker)
return speakers
@staticmethod
def find_persons(conferences: List[Conference]) -> List[ConferencePerson]:
def find_persons(conferences: List[Conference]) -> List[List[ConferencePerson]]:
"""
Search every person mentioned on the conference speaker websites.
:param conferences: the conferences to search from
......@@ -58,7 +60,7 @@ class ConferenceDataHandler:
"""
persons = []
for c in conferences:
persons.extend(c.get_conference_persons())
persons.append(c.get_conference_persons())
return persons
@staticmethod
......
This diff is collapsed.
model_checkpoint_path: "model.ckpt-10000"
all_model_checkpoint_paths: "model.ckpt-10000"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment