Commit 6265edc3 authored by Orkut Karacalik's avatar Orkut Karacalik
Browse files

update

parent 3bbeecca
# H-Index of Conference Speakers
## Methodology
1. Select field in CS
2. Search for external links
- it can include conferences
3. Open webpage
- it can include conference links
- conference links w/ year
4. Look for links that include keywords
- link-text includes keywords
- link-href includes keywords
- it can be same page
### Python Requirements
1. scrapy
2. spacy
\ No newline at end of file
......@@ -22,3 +22,10 @@ def cretae_gender_field_df(df):
# Load the scraped author/speaker records and build a gender-by-field table.
# NOTE(review): "cretae" is a typo for "create" in the helper's name; the
# helper is defined earlier in this file (outside this hunk), so the call
# must keep the misspelled name until both are renamed together.
df = pd.read_json('../scrapers/o-5.json')
df_gender_field = cretae_gender_field_df(df)
# Normalize each column to percentages of its own total.
df_gender_field_normalized = df_gender_field.apply(lambda col: col / col.sum() * 100 )
# Transpose so fields become rows; sort by the 'female' share for plotting.
df_gender_field = df_gender_field.transpose().sort_values(by='female', ascending=False)
df_gender_field_normalized = df_gender_field_normalized.transpose().sort_values(by='female', ascending=True)
# Horizontal bar chart of the normalized shares, one bar group per field.
ax = df_gender_field_normalized.plot.barh(figsize=(12,16), grid=True)
ax.set(xlabel='Ratio', ylabel='Fields')
#df_ = pd.read_json('../scrapers/top-computer-science-conferences.json')
#df_ = pd.read_json('conferences.json')
# Keep only rows with a known h-index for the follow-up analysis.
df_h_index = df[df.h_index.notnull()]
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Split the scraped h-index records into female and male series."""
import pandas as pd

df = pd.read_json("h-index.json")
# Rows whose h-index was actually resolved by the scraper.
df_with_h_index = df[df.h_index.notnull()]
# Group once, then pull out the per-gender h-index series.
h_index_by_gender = df.groupby('gender')['h_index']
women = h_index_by_gender.get_group("female")
men = h_index_by_gender.get_group("male")
......@@ -4,9 +4,14 @@
import pandas as pd
import spacy
import re
from functools import reduce

# Multi-language ('xx') spaCy model; used for NER over scraped speaker pages.
nlp = spacy.load('xx')
def cmp(a, b):
    """Return True if any aligned pair of elements differs.

    Pairs are formed positionally; when the sequences have different
    lengths the unmatched tail is ignored (zip semantics).
    """
    for first, second in zip(a, b):
        if first != second:
            return True
    return False
def find_names(text):
doc = nlp(text)
entries_person = filter(lambda x: x.label_ == 'PER', doc.ents)
......@@ -23,4 +28,18 @@ def process_text(pages):
# Extract speaker names from every scraped page, then keep only conferences
# that have at least one real speaker and merge their speaker lists by name.
df_speakers = pd.read_json('../scrapers/conference-speakers.json')
ans = df_speakers['speaker_pages'].apply(process_text)
# NOTE(review): attribute-style assignment to an existing column; process_text
# is defined earlier in this file (outside this hunk) — TODO confirm it
# returns a list of speaker dicts per page.
df_speakers.speakers = df_speakers['speaker_pages'].apply(process_text)
df_speakers_real = pd.read_json('conferences.json')
# Speaker count per conference; a list equal to the single placeholder entry
# (empty name, 'male', empty organization) counts as zero speakers.
df_speakers_real['nums'] = df_speakers_real['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gender': 'male', 'organization': ''}]) else 0)
conferences = df_speakers_real[df_speakers_real.nums > 0]
# NOTE(review): inplace drop on a filtered slice may trigger
# SettingWithCopyWarning in pandas — verify.
conferences.drop(['hasKeywords', 'website', 'speaker_urls'], inplace=True, axis=1)
conferences_ = df_speakers[df_speakers.name.isin(conferences.name)]
conferences_.drop(['speaker_pages', 'hasKeywords', 'website', 'speaker_urls'], inplace=True, axis=1)
conferences_.speakers = conferences_.speakers.apply(list)
grouped = conferences_.groupby(['name']) #.aggregate({ 'speakers': })
# Concatenate the per-row speaker lists of each conference into one list.
my_lambda = lambda x: reduce(lambda a,b: a+b, x)
output = grouped.aggregate({'speakers': my_lambda}).reset_index()
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# EXAM Q: shortest path, how to compute closeness centrality
# Clean the raw scraped pages: drop unused columns, rename the speaker
# columns, and re-serialize as a records-oriented JSON file.
import pandas as pd
import w3lib
df_raw = pd.read_json('pages-1.json')
df = df_raw.drop(['hasKeywords', 'website'], axis=1)
df = df.rename(index=str, columns={"speaker_urls": "url", "speaker_pages": "page"})
# Spot-check one row; index is a string label after rename(index=str).
index = 1
page = df.page[index]
url = df.url[index]
df.to_json('pages-cleaned-1.json', orient='records')
......@@ -22,11 +22,13 @@ class ConferenceSpeakersSpider(scrapy.Spider):
yield Request(url, self.parse, meta=dict(conference=conference))
def parse(self, response):
# Handle one conference page: store the raw HTML and the final URL on the
# conference item passed along via Request.meta, then yield the item.
# NOTE(review): this hunk interleaves both sides of a diff — the first two
# live statements below are the superseded NER pass (now duplicated as the
# commented-out lines); reconcile before running this as-is.
text = self.__remove_whitespace__(remove_tags(replace_escape_chars(strip_html5_whitespace(response.text))))
speakers = self.__find_names__(text)
page = response.text
# text = self.__remove_whitespace__(remove_tags(replace_escape_chars(strip_html5_whitespace(response.text))))
# speakers = self.__find_names__(text)
conference = response.meta['conference']
conference['speakers'].extend(speakers)
conference['speaker_pages'].append(text)
# conference['speakers'].extend(speakers)
conference['speaker_urls'] = response.url
conference['speaker_pages'] = page
yield conference
def __remove_whitespace__(self, text):
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment