Commit 4b7ed143 authored by Orkut Karaçalık

NER scripts and notebook update

parent 1f204a82
.directory
**/scrapers/*.json
**/temp/*.json
*.pyc
*.ini
*.txt
@@ -3,19 +3,3 @@
### Python Requirements
1. scrapy
2. spacy
### ToDo
1. Do men get invited more often than women with the same level of success?
   - *KL divergence between the male and female distributions (e.g. of h-index); see the sketch after this list
2. Do men get invited repeatedly more often than women from the same field?
   - *test whether the difference between the male and female means is significant
3. Is the number of women speakers at conferences growing?
   - *compare the ratio of male to female speakers over time
4. Presentation
   - No code on slides
   - *Research questions and the answers/explanations we found
   - *Takeaway points from the project
   - *Relate results to related work
5. Graphs
   - *Previous-Presentation-Fig.1: improve readability
   - *Previous-Presentation-Fig.2: redo it for each subfield
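A minimal sketch of how questions 1–3 could be checked, assuming speaker records shaped like the scraper output (a JSON file such as `o-8.json` with `gender`, `h_index` and `date` columns; the path and column names are assumptions):

```python
import numpy as np
import pandas as pd
from scipy.stats import entropy, ttest_ind

df = pd.read_json('../src/scrapers/o-8.json')      # assumed location of speaker data
df = df[df.h_index.notnull()]
female = df[df.gender == 'female'].h_index
male = df[df.gender == 'male'].h_index

# Q1: KL divergence between the female and male h-index distributions (shared bins).
bins = np.histogram_bin_edges(df.h_index, bins=20)
pmf_f, _ = np.histogram(female, bins=bins, density=True)
pmf_m, _ = np.histogram(male, bins=bins, density=True)
kl = entropy(pmf_f + 1e-9, pmf_m + 1e-9)            # small constant avoids empty bins

# Q2: is the difference between the male and female means significant?
t_stat, p_value = ttest_ind(female, male, equal_var=False)

# Q3: share of female speakers per year.
female_share_per_year = df.groupby('date').apply(lambda g: (g.gender == 'female').mean())
```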
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
def cmp(a, b):
    # True if the two sequences differ anywhere over their common prefix;
    # used to tell real speaker lists apart from the empty placeholder entry.
    pairs = zip(a, b)
    return any(x != y for x, y in pairs)

df = pd.read_json('./conferences.json')
# Count speakers only for conferences whose list is not the single placeholder entry.
df['nums'] = df['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gender': 'male', 'organization': ''}]) else 0)
total_conferences = df[df.nums > 0].shape[0]
total_speakers = df.nums.sum()
speakers = pd.read_json('../src/scrapers/o-8.json')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pandas as pd
import re
from functools import reduce
from scrapy.http import HtmlResponse
def hasHref(items):
    # True if any of the HTML snippets contains a link.
    return any(bool(re.search(r'href=', i)) for i in items)

def hasScript(items):
    # True if any of the HTML snippets contains a script tag.
    return any(bool(re.search(r'script', i)) for i in items)

def remove_non_hrefs(items):
    # Keep only snippets that contain both a script tag and a link.
    return list(filter(lambda x: bool(re.search(r'script', x)) and bool(re.search(r'href=', x)), items))

def getNoHrefCount(df):
    # Number of rows whose keyword matches contain no link at all.
    return df[df.hasHref == False].shape[0]

def convert_html(items):
    # Parse each snippet and pull out the first href it links to.
    return list(map(lambda item: HtmlResponse(url="a", body=item, encoding="utf-8").xpath('//*/@href').extract_first(), items))
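# Usage sketch (assumption: each entry in 'hasKeywords' is a raw HTML snippet):
#   hasHref(['<a href="speakers.html">Speakers</a>'])       -> True
#   convert_html(['<a href="speakers.html">Speakers</a>'])   -> ['speakers.html']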
#with open('../scrapers/top-computer-science-conferences-keywords-2.json') as f:
# conferences = json.load(f)
#df_1 = pd.read_json('../scrapers/top-computer-science-conferences-keywords-1.json')
#df_1['hasHref'] = df_1['hasKeywords'].apply(hasHref)
df_new = pd.read_json('../scrapers/top-computer-science-conferences-keywords-6.json')
df_new['isOnOtherPage'] = df_new['hasKeywords'].apply(hasHref)
#diff_1 = df_1[~df_1.name.isin(df_new.name)]
#diff_2 = df_new[~df_new.name.isin(df_1.name)]
df_speakers = pd.read_json('../scrapers/conference-speakers.json')
df = df_speakers.drop(['speaker_pages', 'speakers'], axis=1)
df = df.groupby('name').first().reset_index()
df['speakers'] = pd.Series([[{"name": "", "organization": ""}]]*df.shape[0])
#df.to_json('conferencessds.json', orient='records')
dfa = pd.read_json('./conferences.json')
#df.assign(speakers = lambda x: pd.Series([{"name": "", "organization": ""}]))
#df['fields'] = df['fields'].astype(str)
#df['hasKeywords'] = df['hasKeywords'].astype(str)
#df['speaker_urls'] = df['speaker_urls'].astype(str)
#df.drop_duplicates(inplace=True)
#df['fields'] = df['fields'].astype(object)
#df['hasKeywords'] = df['hasKeywords'].astype(object)
#df['speaker_urls'] = df['speaker_urls'].astype(object)
#ans = df_speakers['speakers'].apply(pd.Series)
#df_speakers.groupby(['date', 'fields', 'name', 'website']).agg({ 'speakers': lambda x: ' '.join(x) })
#df_filtered = df_new[df_new.hasHref]
#ans = df_new['hasKeywords'].apply(convert_html)
#def remove_whitespace(text):
# return re.sub(' +|&nbsp|&nbsp;', ' ', text)
#
#def get_persons(entries):
# entries_person = filter(lambda x: x.label_ == 'PER', entries)
# return list(map(lambda x: x.text, entries_person))
#
#flatten = lambda l: [item for sublist in l for item in sublist]
#with open('speakers-method-2-2.json') as f:
# speakers_method = json.load(f)
#speakers = reduce(lambda x, y: x + y,
# list(map(lambda x: x['speakers'], speakers_method)))
#speakers = list(map(lambda x: { "a": x['field'], "names": flatten(x['speakers']) }, speakers_method))
#links = reduce(lambda x, y: x + y,
# list(map(lambda x: [x['website']], conference_links)))
#links.sort()
#page = remove_whitespace(speakers_method_1[0]['speaker_pages'][0])
#nlp = spacy.load('xx')
#doc = nlp(page)
#
#for ent in doc.ents:
# if ent.label_== 'PER': print(type(ent.label_), ent.label_, ent.text, ent.start_char, ent.end_char)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import time
import calendar
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from scipy.stats import entropy
import scrapy.crawler
from functools import reduce
from twisted.internet import reactor
from multiprocessing import Process, Queue
def get_root_dir():
    return f'{os.path.dirname(os.path.realpath(__file__))}/..'

def generate_temp_filename(name):
    return f'{get_root_dir()}/temp/{name}-{calendar.timegm(time.gmtime())}.json'

def get_temp_file(name):
    return f'{get_root_dir()}/temp/{name}.json'

def generate_data_filename(name):
    return f'{get_root_dir()}/data/{name}-{calendar.timegm(time.gmtime())}.json'

def get_data_file(name):
    return f'{get_root_dir()}/data/{name}.json'
def run_spider(spider, settings):
    # Run the spider in a separate process: Twisted's reactor cannot be
    # restarted, so each crawl gets a fresh reactor of its own.
    def f(q):
        try:
            runner = scrapy.crawler.CrawlerRunner(settings)
            deferred = runner.crawl(spider)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    # Re-raise any exception that happened inside the crawl process.
    if result is not None:
        raise result
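# Usage sketch (the spider's module path and the feed settings below are
# assumptions; they are not shown in this commit):
#
#   from scrapers.css.spiders.conference_speakers import ConferenceSpeakersSpider
#   run_spider(ConferenceSpeakersSpider,
#              {'FEED_FORMAT': 'json',
#               'FEED_URI': generate_temp_filename('conference-speakers')})
#
# Each call spawns its own process because a Twisted reactor cannot be
# restarted within the same process.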
def get_unique_fields(df, col='conference_fields'):
    init_list = lambda x: x if type(x) is list else [x]
@@ -16,7 +55,6 @@ def get_unique_fields(df, col='conference_fields'):
    else:
        return df[col].tolist()
def create_gender_field_df(df):
    genders = df.gender.unique()
    fields = get_unique_fields(df)
@@ -59,53 +97,5 @@ def get_distribution_plot(df, col='h_index', title='all'):
    ax.legend(handles=patches)
    plt.show()
# TODO: make histogram
df = pd.read_json('../src/scrapers/o-8.json')
# Speaker counts per field and gender; keep only fields with more than 40 speakers.
df_gender_field = create_gender_field_df(df).transpose()
df_gender_field['total'] = df_gender_field['male'] + df_gender_field['female']
df_gender_field_top = df_gender_field[df_gender_field.total > 40].copy()  # copy() avoids SettingWithCopyWarning
df_gender_field_top['female_ratio'] = df_gender_field_top['female'] / df_gender_field_top['total'] * 100
df_gender_field_top['male_ratio'] = df_gender_field_top['male'] / df_gender_field_top['total'] * 100
df_gender_field_top = df_gender_field_top.sort_values(by='female_ratio')
# Horizontal bar chart of the female/male share per field.
ax = df_gender_field_top.loc[:, ['female_ratio', 'male_ratio']].plot.barh(figsize=(8, 6), grid=True)
plt.yticks(fontsize=18)
#df_gender_field_normalized = df_gender_field_top.apply(lambda col: col / col.sum() * 100 )
#df_gender_field = df_gender_field.transpose().sort_values(by='female', ascending=False)
#df_gender_field_normalized = df_gender_field_normalized.transpose().sort_values(by='female', ascending=True)
#ax = df_gender_field_normalized.plot.barh(figsize=(12,16), grid=True)
#ax.set(xlabel='Ratio', ylabel='Fields')
#df_ = pd.read_json('../scrapers/top-computer-science-conferences.json')
#df_ = pd.read_json('conferences.json')
df_h_index = df[df.h_index.notnull()]
fields = get_unique_fields(df)
years = get_unique_fields(df, 'date')
df_h_index_per_fields = group_by_fields(df_h_index)
df_h_index_per_year = group_by_years(df_h_index)
df_h_index_per_year_per_fields = group_by_years_fields(df_h_index, years, fields) # { y: { f: df_h_index_per_year[y][df_h_index_per_year[y].conference_fields.apply(lambda x: f in x)] for f in fields } for y in df_h_index_per_year.keys() }
#get_distribution_plot(df_h_index)
#get_distribution_plot(df_h_index)
#get_distribution_plot(df_h_index_per_year)
#get_distribution_plot(df_h_index_per_year_per_fields)
#a = get_unique_fields(df_h_index_per_year_per_fields[2017]['human-computer-interaction'], 'gender')
#female = df_h_index.groupby('gender').get_group('female')['h_index']
#male = df_h_index.groupby('gender').get_group('male')['h_index']
#pmf_female, bins_female = np.histogram(female, density=True)
#pmf_male, bins_male = np.histogram(male, density=True)
#dist_f = dict(zip(bins_female, pmf_female))
#dist_m = dict(zip(bins_male, pmf_male))
#
#answer = entropy(pmf_female, pmf_male)
#def find_female_male(df):
# return { 'female': df[df.gender == 'female'].shape[0], 'male': df[df.gender == 'male'].shape[0] }
#
#ratio_per_year = { y: find_female_male(df_h_index_per_year[y]) for y in df_h_index_per_year.keys() }
#plt.pie(ratio_per_year[2017].values(), explode = (0.1, 0), labels=ratio_per_year[2017].keys(), colors=['gold', 'yellowgreen'], autopct='%1.1f%%', shadow=True)
#plt.axis('equal')
#plt.show()
#plt.pie(ratio_per_year[2018].values(), explode = (0.1, 0), labels=ratio_per_year[2018].keys(), colors=['gold', 'yellowgreen'], autopct='%1.1f%%', shadow=True)
#plt.axis('equal')
#plt.show()
def find_female_male(df):
    return { 'female': df[df.gender == 'female'].shape[0], 'male': df[df.gender == 'male'].shape[0] }
\ No newline at end of file
@@ -5,6 +5,7 @@ import pandas as pd
import spacy
import re
from functools import reduce
from helper import get_data_file
nlp = spacy.load('xx')
@@ -16,7 +17,7 @@ def find_names(text):
    doc = nlp(text)
    entries_person = filter(lambda x: x.label_ == 'PER', doc.ents)
    return list(map(lambda x: x.text, entries_person))
# Sequences of two or more capitalised words (e.g. "Grace Hopper") are candidate names.
r = re.compile(r'([A-Z]\w+(?=[\s\-][A-Z])(?:[\s\-][A-Z]\w+)+)', re.UNICODE)
def process_text(pages):
@@ -26,20 +27,21 @@ def process_text(pages):
    matches = no_multiple_space.apply(lambda text: ' '.join(r.findall(text)))
    return matches.apply(find_names)
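# Note: for each page, process_text returns the PER entities spaCy finds in the
# page's capitalised word sequences, e.g. ['Ada Lovelace', 'Alan Turing']
# (the example names are illustrative only).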
df_speakers = pd.read_json('../scrapers/conference-speakers.json')
df_speakers.speakers = df_speakers['speaker_pages'].apply(process_text)
df_speakers_real = pd.read_json('conferences.json')
df_speakers_real['nums'] = df_speakers_real['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gender': 'male', 'organization': ''}]) else 0)
conferences = df_speakers_real[df_speakers_real.nums > 0]
conferences.drop(['hasKeywords', 'website', 'speaker_urls'], inplace=True, axis=1)
conferences_ = df_speakers[df_speakers.name.isin(conferences.name)]
conferences_.drop(['speaker_pages', 'hasKeywords', 'website', 'speaker_urls'], inplace=True, axis=1)
conferences_.speakers = conferences_.speakers.apply(list)
grouped = conferences_.groupby(['name']) #.aggregate({ 'speakers': })
my_lambda = lambda x: reduce(lambda a,b: a+b, x)
output = grouped.aggregate({'speakers': my_lambda}).reset_index()
def run():
    df_speakers = pd.read_json(get_data_file('pages-cleaned'))
    df_speakers.speakers = df_speakers['page'].apply(process_text)
    df_speakers_real = pd.read_json(get_data_file('conferences'))
    df_speakers_real['nums'] = df_speakers_real['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gender': 'male', 'organization': ''}]) else 0)
    conferences = df_speakers_real[df_speakers_real.nums > 0]
    conferences.drop(['speaker_urls'], inplace=True, axis=1)
    conferences_ = df_speakers[df_speakers.name.isin(conferences.name)]
    conferences_.drop(['page'], inplace=True, axis=1)
    conferences_.speakers = conferences_.speakers.apply(list)
    grouped = conferences_.groupby(['name'])
    my_lambda = lambda x: reduce(lambda a, b: a + b, x)
    output = grouped.aggregate({'speakers': my_lambda}).reset_index()
    return output
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# EXAM Q: shortest path, how to compute closeness centrality
import pandas as pd
import re
import w3lib
from scrapy.http import HtmlResponse
from difflib import SequenceMatcher, Differ
from helper import generate_temp_filename, get_data_file
def convert_html(text):
    return HtmlResponse(url="a", body=text, encoding="utf-8")

def pre_process(text):
    # convert to html
    text_1 = max(convert_html(text).xpath('//body').extract(), key=len)
    text_1 = max(convert_html(str.encode(text)).xpath('//body').extract(), key=len)
    # remove comments
    text_2 = w3lib.html.remove_comments(text_1, encoding="utf-8")
    # remove script
@@ -24,20 +24,12 @@ def pre_process(text):
    text_5 = re.sub(' +|&nbsp;|&nbsp', ' ', text_4)  # match '&nbsp;' before '&nbsp' so the trailing ';' is removed too
    return text_5
df_raw = pd.read_json('pages-2.json')
df = df_raw.drop(['hasKeywords', 'website'], axis=1)
df = df.rename(index=str, columns={"speaker_urls": "url", "speaker_pages": "page"})
#df['lenPre'] = df.page.apply(lambda x: len(x))
df.page = df.page.apply(pre_process)
df.home = df.home.apply(pre_process)
#df['lenPost'] = df.page.apply(lambda x: len(x))
#df['reduce'] = (df.lenPre - df.lenPost) / df.lenPre * 100
#df['sim'] =
#df.to_json('pages-cleaned-1.json', orient='records')
index = 137
home = df.home[index]
page = df.page[index]
url = df.url[index]
page_old = df_raw.speaker_pages[index]
def run():
    df_raw = pd.read_json(get_data_file('pages'))
    df = df_raw.drop(['hasKeywords', 'website'], axis=1)
    df = df.rename(index=str, columns={"speaker_urls": "url", "speaker_pages": "page"})
    df.page = df.page.apply(pre_process)
    df.home = df.home.apply(pre_process)
    df.to_json(generate_temp_filename('pages-cleaned'), orient='records')
# -*- coding: utf-8 -*-
import json
import re
import spacy
import os
import scrapy
from scrapy.http.request import Request
from functools import reduce
from w3lib.html import remove_tags, strip_html5_whitespace, replace_escape_chars
class ConferenceSpeakersSpider(scrapy.Spider):
    name = 'conference-speakers'
    nlp = spacy.load('xx')

    # 1. process html items: parse, urljoin, duplicates
    # 2. make request
    # 3. ner
    def start_requests(self):
        with open('top-computer-science-conferences-keywords-7.json') as f:
        with open(f'{os.path.dirname(os.path.realpath(__file__))}/../../../../data/conferences.json') as f:
            data = json.load(f)
            for conference in data:
                for url in conference['speaker_urls']:
@@ -23,18 +16,7 @@ class ConferenceSpeakersSpider(scrapy.Spider):
    def parse(self, response):
        page = response.text
        # text = self.__remove_whitespace__(remove_tags(replace_escape_chars(strip_html5_whitespace(response.text))))
        # speakers = self.__find_names__(text)
        conference = response.meta['conference']
        # conference['speakers'].extend(speakers)
        conference['speaker_urls'] = response.url
        conference['speaker_pages'] = page
        yield conference
    def __remove_whitespace__(self, text):
        return re.sub(' +|&nbsp|&nbsp;', ' ', text)

    def __find_names__(self, text):
        doc = self.nlp(text)
        entries_person = filter(lambda x: x.label_ == 'PER', doc.ents)
        return list(map(lambda x: x.text, entries_person))
@@ -5,6 +5,7 @@ from scrapy.http.request import Request
from scrapers.css.items import ConferenceItem
from copy import deepcopy
# This spider is deprecated: it was used for the Wikipedia seed list, and we later switched to another seed list.
class CsConferencesSpider(scrapy.Spider):
    name = 'cs-conferences'
    allowed_domains = ['en.wikipedia.org']
@@ -7,6 +7,7 @@ import json
import spacy
import re
# This spider is deprecated: it was used for the Wikipedia seed list, and we later switched to another seed list.
class SpeakersSpider(scrapy.Spider):
    name = 'speakers'
    start_urls = ['http://www.ase2014.org/',
### Temporary files
\ No newline at end of file