Commit e386617a authored by Orkut Karaçalık

fix-update h_index spider

parent d676c5b2
 .directory
-scrapers/*.json
-src/scrapers/*.json
+**/scrapers/*.json
 *.pyc
 *.ini
 *.txt
@@ -12,4 +11,5 @@ src/scrapers/*.json
 *.prop
 *.java
 *.command
-*.idea
\ No newline at end of file
+*.idea
+.ipynb_checkpoints/
\ No newline at end of file
@@ -60,7 +60,7 @@ def get_distribution_plot(df, col='h_index', title='all'):
     plt.show()
 # TODO: make histogram
-df = pd.read_json('../scrapers/o-7.json')
+df = pd.read_json('../src/scrapers/o-8.json')
 df_gender_field = create_gender_field_df(df).transpose()
 df_gender_field['total'] = df_gender_field['male'] + df_gender_field['female']
 df_gender_field_top = df_gender_field[df_gender_field.total > 40]
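The helper create_gender_field_df is defined elsewhere in this script and does not appear in the diff; below is a minimal, hypothetical sketch of what it plausibly computes, assuming each scraped row carries a gender string and a conference_fields list (the fields emitted by the spider's AuthorItem). This is a reconstruction for illustration, not the committed helper.

import pandas as pd

def create_gender_field_df(df):
    # Hypothetical reconstruction: count speakers per (gender, field).
    exploded = df.explode('conference_fields')  # one row per (speaker, field)
    counts = exploded.groupby(['gender', 'conference_fields']).size()
    return counts.unstack(fill_value=0)  # rows: gender, columns: field

After the .transpose() above, genders become columns, which is why df_gender_field['male'] + df_gender_field['female'] works and total > 40 keeps the fields with the most speakers.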
@@ -13,4 +13,4 @@ df['nums'] = df['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gend
 total_conferences = df[df.nums > 0].shape[0]
 total_speakers = df.nums.sum()
-speakers = pd.read_json('../scrapers/o-5.json')
+speakers = pd.read_json('../src/scrapers/o-8.json')
@@ -660,7 +660,7 @@
 {
     "name": "Ken Birman",
     "gender": "male",
-    "organization": ""
+    "organization": "Cornell University"
 },
 {
     "name": "Fred Chong",
@@ -2,10 +2,23 @@
 # -*- coding: utf-8 -*-
 import pandas as pd
 import unicodedata
-df = pd.read_json("h-index.json")
+def convert_ascii(name):
+    return (unicodedata.normalize('NFD', name).encode('ascii', 'ignore')).decode("utf-8")
+df = pd.read_json("h-index.json")
+df.name = df.name.apply(convert_ascii)
 df_with_h_index = df[df.h_index.notnull()]
 df_without_h_index = df[df.h_index.isnull()]
 women = df.groupby('gender').get_group("female")['h_index']
 men = df.groupby('gender').get_group("male")['h_index']
+df_alt = pd.read_json("../src/scrapers/o-8.json")
+df_alt_with_h_index = df_alt[df_alt.h_index.notnull()]
+df_alt_without_h_index = df_alt[df_alt.h_index.isnull()]
+df_non = df[~df.name.isin(df_alt.name)]
+df_non_a = df_alt[~df_alt.name.isin(df.name)]
+df_int = df_alt[df_alt.name.isin(df.name)]
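For intuition, convert_ascii strips accents by NFD-decomposing each character into a base letter plus combining marks and then dropping everything non-ASCII. A small usage sketch (the example names are illustrative, not from the dataset):

import unicodedata

def convert_ascii(name):
    # NFD splits e.g. 'é' into 'e' + U+0301; encode(..., 'ignore') drops the mark.
    return (unicodedata.normalize('NFD', name).encode('ascii', 'ignore')).decode("utf-8")

print(convert_ascii('José Müller'))  # -> 'Jose Muller'
print(convert_ascii('Łukasz'))       # 'Ł' has no NFD decomposition -> 'ukasz'

Note the caveat in the second call: characters with no canonical decomposition are removed outright, so two distinct names can collapse to the same ASCII string.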
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-12T12:02:41.641340Z",
"start_time": "2018-07-12T12:02:41.630234Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-12T12:05:06.346877Z",
"start_time": "2018-07-12T12:05:06.343664Z"
}
},
"outputs": [],
"source": [
"# paths\n",
"path_data = '../data/'\n",
"path_figures = '../figures/'\n",
"path_output = '../output/'\n",
"path_temp = '../temp/'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Scrape computer science conferences from guide2research.con"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"2. Parse conference homepages"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"3. Parse/Extract person names"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"4. Scrape h-index for each speaker"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"5. Male / Female ratio over time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"6. Male / Femal ratio in 7 subfields"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"7. H-index distribution based on gender - Overall"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"8. H-index distribution based on gender - Top 4 speakers count"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-12T12:22:33.847814Z",
"start_time": "2018-07-12T12:22:33.843615Z"
}
},
"source": [
"9. KL Divergence"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"10. Conclusion"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
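Step 9 of this outline names KL divergence, but its cell is still empty in this commit. A minimal sketch of how the gender-conditioned h-index distributions could be compared, assuming the o-8.json schema used above; scipy.stats.entropy returns the KL divergence when given two distributions:

import numpy as np
import pandas as pd
from scipy.stats import entropy

df = pd.read_json('../src/scrapers/o-8.json')
df = df[df.h_index.notnull()]
women = df[df.gender == 'female'].h_index.astype(float)
men = df[df.gender == 'male'].h_index.astype(float)

# Shared bins make the two histograms comparable.
bins = np.histogram_bin_edges(df.h_index.astype(float), bins=20)
p_women, _ = np.histogram(women, bins=bins, density=True)
p_men, _ = np.histogram(men, bins=bins, density=True)

eps = 1e-9  # avoid log(0) and empty bins
print('KL(women || men):', entropy(p_women + eps, p_men + eps))

KL divergence is asymmetric, so KL(women || men) and KL(men || women) generally differ; report both, or a symmetrised variant, if a single number is needed.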
@@ -3,11 +3,18 @@ import scrapy
 import datetime
 import json
 import re
+import unicodedata
 from css.items import AuthorItem
 from scrapy.http.request import Request
 from difflib import SequenceMatcher
+from scrapy.http import HtmlResponse
+from itertools import product
+from jellyfish import damerau_levenshtein_distance
+from functools import reduce
+from w3lib.html import remove_entities
+# TODO: get puppeteer
 class HIndexSpider(scrapy.Spider):
     name = 'h-index'
     allowed_domains = ['scholar.google.com']
@@ -16,65 +23,111 @@ class HIndexSpider(scrapy.Spider):
     ]
     def start_requests(self):
-        url = lambda name: f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split(' '))}"
-        with open('../data/conferences.json') as f:
+        def url(
+            name): return f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split(' '))}"
+        with open('../../data/conferences.json') as f:
             data = json.load(f)
         # with open('./sample-speakers.json') as f:
         #     data = json.load(f)
         for conference in data:
             for speaker in conference['speakers']:
                 if len(speaker['name']) > 0:
-                    yield Request(url(speaker['name']), self.parse, meta=dict(conference=conference, speaker=speaker))
+                    # yield Request(url(speaker['name']), self.parse, meta=dict(conference=conference, speaker=speaker))
+                    query = (unicodedata.normalize(
+                        'NFD', speaker['name']).encode('ascii', 'ignore')).decode("utf-8")
+                    yield Request(url(query), self.parse_query_result, meta=dict(conference=conference, speaker=speaker))
-    def parse(self, response):
-        print(response.url)
-        names = response.css("span.gs_hlt::text").extract()
-        profile_links = response.css("h3.gsc_oai_name>a::attr(href)").extract()
-        print(response.meta['speaker']['name'], profile_links)
-        if len(names) > 0 and len(names) == len(profile_links):
-            query_similarity = list(map(lambda x: self.__get_similarity__(response.meta['speaker']['name'], x), names))
-            max_similar_name_link = profile_links[query_similarity.index(max(query_similarity))]
-            yield Request(response.urljoin(max_similar_name_link), self.parse_profile, meta=dict(conference=response.meta['conference'], speaker=response.meta['speaker']))
-        elif len(profile_links) == 1:
-            yield Request(response.urljoin(profile_links[0]), self.parse_profile, meta=dict(conference=response.meta['conference'], speaker=response.meta['speaker']))
+    def parse_query_result(self, response):
+        items = response.css('div.gsc_oai').extract()
+        items_extracted = []
+        speaker_name = response.meta['speaker']['name']
+        speaker_gender = response.meta['speaker']['gender']
+        speaker_organization = response.meta['speaker']['organization']
+        conference_fields = response.meta['conference']['fields']
+        for item in items:
+            item_html = HtmlResponse(url="item", body=item, encoding="utf-8")
+            name = self.__get_author_name__(item_html.xpath(
+                "//a[span[contains(@class, 'gs_hlt')]]").extract())  # item <-> item
+            name_score = self.__calculate_match_score__([speaker_name], [name])
+            profile_link = item_html.css(
+                "h3.gsc_oai_name>a::attr(href)").extract_first()
+            organization = self.__get_organization_name__(item_html.css(
+                "div.gsc_oai_aff").extract_first())  # arr <-> item
+            organization_score = self.__calculate_match_score__(
+                [speaker_organization, self.__get_organization_abbreviation(speaker_organization)], [organization])
+            fields = item_html.css(
+                "div.gsc_oai_int > a.gsc_oai_one_int::text").extract()  # arr <-> arr
+            fields_score = self.__calculate_match_score__(conference_fields, map(
+                lambda x: (x.encode('ascii', 'ignore')).decode("utf-8"), fields))
+            overall_score = self.__compute_weighted_match_score(
+                [(name_score, .33), (organization_score, .33), (fields_score, .33)])
+            items_extracted.append(dict(
+                name=name, profile_link=profile_link, organization=organization, fields=fields, gender=speaker_gender, score=overall_score))
+            # print(f'-> {overall_score} {name} {organization}')
+        if len(items_extracted) > 0:
+            speaker = max(items_extracted, key=lambda x: x['score'])
+            # print(speaker)
+            yield Request(response.urljoin(speaker['profile_link']), self.parse_profile, meta=dict(conference=response.meta['conference'], speaker=speaker))
         else:
-            now = datetime.datetime.now()
-            conference = response.meta['conference']
-            speaker = response.meta['speaker']
-            item = AuthorItem()
-            item['conference_name'] = conference['name']
-            item['conference_fields'] = conference['fields']
-            item['gender'] = speaker['gender']
-            item['organization'] = speaker['organization']
-            item['date'] = re.search(r'(20\d{2})', conference['date']).group()  # now.strftime("%Y-%m-%d")
-            item['name'] = speaker['name']
-            item['citations'] = None
-            item['citations_last_5_year'] = None
-            item['h_index'] = None
-            item['h_index_lat_5_year'] = None
-            item['google_scholar_profile'] = None
-            yield item
+            yield self.__yield_profile__(response.meta['conference'], response.meta['speaker'])
     def parse_profile(self, response):
         now = datetime.datetime.now()
         name = response.css("#gsc_prf_in::text").extract_first()
         citation_table = response.css(".gsc_rsb_std::text").extract()
         print(name, citation_table)
         if len(citation_table) > 0:
-            conference = response.meta['conference']
-            speaker = response.meta['speaker']
-            item = AuthorItem()
-            item['conference_name'] = conference['name']
-            item['conference_fields'] = conference['fields']
-            item['gender'] = speaker['gender']
-            item['organization'] = speaker['organization']
-            item['date'] = re.search(r'(20\d{2})', conference['date']).group()  # now.strftime("%Y-%m-%d")
-            item['name'] = name
-            item['citations'] = citation_table[0]
-            item['citations_last_5_year'] = citation_table[1]
-            item['h_index'] = citation_table[2]
-            item['h_index_lat_5_year'] = citation_table[3]
-            item['google_scholar_profile'] = response.url
-            yield item
+            yield self.__yield_profile__(
+                response.meta['conference'], response.meta['speaker'], citation_table, response.url)
+    def __get_author_name__(self, tags):
+        for tag in tags:
+            anchor = HtmlResponse(url="name", body=tag, encoding="utf-8")
+            names = anchor.xpath(
+                "//a/span[contains(@class, 'gs_hlt')]/text() | //a/text()").extract()
+            name_full = ''.join(names).replace('-', ' ')
+            return (name_full.encode('ascii', 'ignore')).decode("utf-8") if len(tags) > 0 else ''
+    def __get_organization_name__(self, tag):
+        if tag:
+            div = HtmlResponse(url="name", body=tag, encoding="utf-8")
+            text = div.xpath(
+                "//div/span[contains(@class, 'gs_hlt')]/text() | //div[contains(@class, 'gsc_oai_aff')]/text()").extract()
+            text_full = ' '.join(map(str.strip, text))
+            return (text_full.encode('ascii', 'ignore')).decode("utf-8")
+        else:
+            return ''
+    def __get_organization_abbreviation(self, name):
+        return re.sub('[^A-Z]', '', name)
+    def __calculate_match_score__(self, queries, retrievals):
+        pairs = list(product(queries, retrievals))
+        scores = []
+        for pair in pairs:
+            # print(pair)
+            s = SequenceMatcher(None, pair[0].replace(
+                '-', ' '), pair[1]).ratio()
+            dl = damerau_levenshtein_distance(
+                pair[0].replace('-', ' '), pair[1]) + 1
+            # print(
+            #     f"------> {pair[0].replace('-', ' ')} {pair[1]} {s} {dl} {s/dl} ")
+            scores.append(s / dl)
+        return 0 if len(scores) < 1 else max(scores)
+    def __compute_weighted_match_score(self, scores_with_weights):
+        return reduce(lambda acc, pair: acc + pair[0] * pair[1], scores_with_weights, 0)
     def __get_similarity__(self, a, b):
         return SequenceMatcher(None, a, b).ratio()
+    def __yield_profile__(self, conference, speaker, citation_table=None, url=None):
+        author = AuthorItem()
+        author['conference_name'] = conference['name']
+        author['conference_fields'] = conference['fields']
+        author['date'] = re.search(r'(20\d{2})', conference['date']).group()
+        author['gender'] = speaker['gender']
+        author['organization'] = speaker['organization']
+        author['name'] = speaker['name']
+        author['citations'] = None if citation_table is None else citation_table[0]
+        author['citations_last_5_year'] = None if citation_table is None else citation_table[1]
+        author['h_index'] = None if citation_table is None else citation_table[2]
+        author['h_index_lat_5_year'] = None if citation_table is None else citation_table[3]
+        author['google_scholar_profile'] = url
+        return author
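For intuition about the new matching heuristic: __calculate_match_score__ scores each query/candidate pair as the SequenceMatcher ratio (which rises toward 1 for similar strings) divided by the Damerau-Levenshtein distance plus one (so every edit shrinks the score), and keeps the best pair. A standalone sketch of the same idea, with illustrative inputs rather than dataset values:

from difflib import SequenceMatcher
from itertools import product
from jellyfish import damerau_levenshtein_distance

def match_score(queries, retrievals):
    # Best ratio / (edit distance + 1) over all query/candidate pairs,
    # mirroring the spider's __calculate_match_score__.
    scores = []
    for q, r in product(queries, retrievals):
        s = SequenceMatcher(None, q.replace('-', ' '), r).ratio()
        dl = damerau_levenshtein_distance(q.replace('-', ' '), r) + 1
        scores.append(s / dl)
    return max(scores, default=0)

print(match_score(['Ken Birman'], ['Ken Birman']))  # exact match: 1.0 / 1 = 1.0
print(match_score(['Ken Birman'], ['Ken Birmna']))  # one transposition: ~0.9 / 2 ≈ 0.45

Dividing by the edit distance makes the score fall off much faster than the ratio alone, which is what lets the equal .33 weights over name, organization, and fields still favor near-exact name matches.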