Commit 3bbeecca authored by Orkut Karacalik
Browse files

update and new data

parent f9aa352d
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas as pd
from functools import reduce
def get_unique_fields(df, col='conference_fields'):
    """Return the distinct values found across the list-valued column *col*.

    Every cell of ``df[col]`` is treated as an iterable of values; all cells
    are flattened together and de-duplicated.

    Args:
        df: DataFrame holding the column to scan.
        col: Name of the column whose cells are iterables (e.g. lists of
            field names). Defaults to ``'conference_fields'``.

    Returns:
        A list of the unique values in arbitrary (set) order. An empty
        column yields an empty list.
    """
    # Local import keeps this function self-contained.
    from itertools import chain

    # Flatten in one linear pass. Folding the cells with ``+`` re-copied the
    # accumulated list on every step and raised TypeError for an empty column.
    return list(set(chain.from_iterable(df[col].tolist())))
def cretae_gender_field_df(df):
    """Build a gender-by-field table of row counts.

    The function name keeps its existing spelling so current callers still
    resolve it.

    Args:
        df: DataFrame with a ``gender`` column and a ``conference_fields``
            column whose cells support the ``in`` operator (e.g. lists of
            field names).

    Returns:
        A DataFrame indexed by the unique ``gender`` values, with one column
        per unique field. Each cell holds the number of rows of ``df`` that
        have that gender and contain that field.
    """
    genders = df.gender.unique()
    fields = get_unique_fields(df)
    df_gender_field = pd.DataFrame(0, index=genders, columns=fields)
    for f in fields:
        # The membership mask depends only on the field, so build it once
        # per field instead of once per (gender, field) pair.
        has_field = df.conference_fields.apply(lambda x, f=f: f in x)
        for g in genders:
            # Assign through a single .loc[row, col] indexer. The chained
            # form .loc[g][f] = ... may write to a temporary row object and
            # leave the table unchanged (always so under Copy-on-Write).
            df_gender_field.loc[g, f] = int((has_field & (df.gender == g)).sum())
    return df_gender_field
# TODO: make histogram
# Load the scraped records; the path is relative to the working directory.
df = pd.read_json('../scrapers/o-5.json')
# Count rows per (gender, field) pair.
df_gender_field = cretae_gender_field_df(df)
# Rescale every column so its entries are percentages of that column's total.
df_gender_field_normalized = df_gender_field.apply(lambda col: col / col.sum() * 100 )
......@@ -13,4 +13,4 @@ df['nums'] = df['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gend
# Number of rows whose ``nums`` value is positive.
# NOTE(review): ``nums`` is presumably the per-conference speaker count set
# on the preceding (truncated) line - confirm.
total_conferences = df[df.nums > 0].shape[0]
# Sum of ``nums`` over all rows.
total_speakers = df.nums.sum()
# NOTE(review): the next two lines look like the old and new versions of one
# diff line; when both run, only the second assignment is kept.
speakers = pd.read_json('../scrapers/o-4.json')
speakers = pd.read_json('../scrapers/o-5.json')
......@@ -4977,9 +4977,17 @@
"website": "https://www2018.thewebconf.org/",
"speakers": [
{
"name": "",
"name": "Luciano Floridi",
"gender": "male",
"organization": ""
"organization": "University of Oxford"
}, {
"name": "Lorrie Faith Cranor",
"gender": "female",
"organization": "Carnegie Mellon University "
}, {
"name": "Ruhi Sarikaya",
"gender": "male",
"organization": "Amazon Alexa team"
}
]
},
......@@ -4999,9 +5007,25 @@
"website": "http://web3d2018.web3d.org/submissions/",
"speakers": [
{
"name": "",
"name": "Patrick Bourdot",
"gender": "male",
"organization": ""
"organization": "EuroVR Association"
}, {
"name": "Dieter W. Fellner",
"gender": "male",
"organization": "TU Darmstadt"
}, {
"name": "Mariano Luis Alcañiz Raya",
"gender": "male",
"organization": "Polytechnic University of Valencia "
}, {
"name": "Daniel Thalmann",
"gender": "male",
"organization": " EPFL, Switzerland & Miralab Sarl, Switzerland"
}, {
"name": "Neil Trevett",
"gender": "male",
"organization": "NVIDIA"
}
]
},
......@@ -5029,9 +5053,57 @@
"website": "http://www.wi-opt.org/",
"speakers": [
{
"name": "",
"name": "Giuseppe Caire",
"gender": "male",
"organization": ""
"organization": "Technical University of Berlin"
}, {
"name": "Sherman Shen",
"gender": "male",
"organization": "University of Waterloo"
}, {
"name": "Junshan Zhang",
"gender": "male",
"organization": "Arizona State University"
}, {
"name": "Jun Zhang",
"gender": "male",
"organization": "Hong Kong University of Science and Technology"
}, {
"name": "Quanyan Zhu",
"gender": "male",
"organization": "New York University"
}, {
"name": "Liang Xiao",
"gender": "female",
"organization": "Xiamen University, China"
}, {
"name": "Haijun Zhang",
"gender": "male",
"organization": "University of Science and Technology Beijing"
}, {
"name": "Edmund Yeh",
"gender": "male",
"organization": "Northeastern University"
}, {
"name": "Stratis Ioannidis",
"gender": "male",
"organization": "Northeastern University"
}, {
"name": "Ness B. Shroff",
"gender": "male",
"organization": "Ohio State University"
}, {
"name": "Mikael Johansson",
"gender": "male",
"organization": "Professor of Electrical Engineering, KTH"
}, {
"name": "Song Chong",
"gender": "male",
"organization": "Korea Advanced Institute of Science and Technology (KAIST)"
}, {
"name": "Minghua Chen",
"gender": "male",
"organization": "The Chinese University of Hong Kong"
}
]
},
......@@ -5045,9 +5117,17 @@
"website": "http://i3dsymposium.github.io/2018/",
"speakers": [
{
"name": "",
"name": "Turner Whitted ",
"gender": "male",
"organization": ""
"organization": "University of North Carolina State University"
}, {
"name": "Aras Pranckevičius",
"gender": "male",
"organization": "Unity Technologies"
}, {
"name": "Daniel Holden",
"gender": "male",
"organization": "Ubisoft"
}
]
}
......
......@@ -36,6 +36,7 @@ class AuthorItem(Item):
h_index = Field()
citations_last_5_year = Field()
h_index_lat_5_year = Field()
google_scholar_profile = Field()
class ComputerScienceConference(Item):
name = Field()
......
......@@ -34,6 +34,23 @@ class HIndexSpider(scrapy.Spider):
yield Request(response.urljoin(max_similar_name_link), self.parse_profile, meta=dict(conference=response.meta['conference'], speaker=response.meta['speaker']))
elif len(profile_links) == 1:
yield Request(response.urljoin(profile_links[0]), self.parse_profile, meta=dict(conference=response.meta['conference'], speaker=response.meta['speaker']))
else:
now = datetime.datetime.now()
conference = response.meta['conference']
speaker = response.meta['speaker']
item = AuthorItem()
item['conference_name'] = conference['name']
item['conference_fields'] = conference['fields']
item['gender'] = speaker['gender']
item['organization'] = speaker['organization']
item['date'] = now.strftime("%Y-%m-%d")
item['name'] = speaker['name']
item['citations'] = None
item['citations_last_5_year'] = None
item['h_index'] = None
item['h_index_lat_5_year'] = None
item['google_scholar_profile'] = None
yield item
def parse_profile(self, response):
......@@ -55,6 +72,7 @@ class HIndexSpider(scrapy.Spider):
item['citations_last_5_year'] = citation_table[1]
item['h_index'] = citation_table[2]
item['h_index_lat_5_year'] = citation_table[3]
item['google_scholar_profile'] = response.url
yield item
def __get_similarity__(self, a, b):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment