Commit 8f50bf93 authored by Orkut Karaçalık
Browse files

update

parent 891a6c72
{
"cells": [
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:27.973870Z",
"start_time": "2018-07-10T15:22:27.968571Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.patches as mpatches\n",
"from functools import reduce"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:27.985541Z",
"start_time": "2018-07-10T15:22:27.975829Z"
}
},
"outputs": [],
"source": [
"def get_unique_fields(df, col='conference_fields'):\n",
" init_list = lambda x: x if type(x) is list else [x]\n",
" if df[col].shape[0] > 1:\n",
" return list(set(reduce(lambda x, y: init_list(x) + init_list(y) , df[col])))\n",
" else:\n",
" return df[col].tolist()\n",
"\n",
"def group_by_fields(df, fields=None):\n",
" fields = get_unique_fields(df) if fields is None else fields\n",
" return { f: df[df.conference_fields.apply(lambda x: f in x)] for f in fields }\n",
"\n",
"def group_by_years(df, years=None):\n",
" years = get_unique_fields(df, 'date') if years is None else years\n",
" return { y: df[df.date == y] for y in years }\n",
"\n",
"def group_by_years_fields(df, years=None, fields=None):\n",
" fields = get_unique_fields(df) if fields is None else fields\n",
" years = get_unique_fields(df, 'date') if years is None else years\n",
" df_years = group_by_years(df, years)\n",
" return { y: group_by_fields(df_years[y], fields) for y in df_years.keys() }\n",
"\n",
"def get_distribution_plot(df, col='h_index', title='all'):\n",
" if type(df) is dict:\n",
" new_title = '' if title == 'all' else f\"{title} /\"\n",
" for key in df.keys():\n",
" get_distribution_plot(df[key], title=f\"{new_title} {key}\")\n",
" else:\n",
" if df.shape[0] > 0:\n",
" genders = get_unique_fields(df, 'gender')\n",
" colors = \"rbgcmykw\"\n",
" patches = []\n",
" for i, gender in enumerate(genders):\n",
" ax = sns.distplot(df.groupby('gender').get_group(gender)[col], hist=True, rug=True, color=colors[i], label=gender)\n",
" patches.append(mpatches.Patch(color=colors[i], label=gender))\n",
" ax.set_title(title)\n",
" ax.legend(handles=patches)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.005283Z",
"start_time": "2018-07-10T15:22:27.987439Z"
}
},
"outputs": [],
"source": [
"df = pd.read_json('../scrapers/o-7.json')\n",
"df_h_index = df[df.h_index.notnull()]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.010977Z",
"start_time": "2018-07-10T15:22:28.007326Z"
}
},
"outputs": [],
"source": [
"fields = get_unique_fields(df)\n",
"years = get_unique_fields(df, 'date')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.073776Z",
"start_time": "2018-07-10T15:22:28.013995Z"
}
},
"outputs": [],
"source": [
"df_h_index_per_fields = group_by_fields(df_h_index)\n",
"df_h_index_per_year = group_by_years(df_h_index)\n",
"df_h_index_per_year_per_fields = group_by_years_fields(df_h_index, years, fields)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.094450Z",
"start_time": "2018-07-10T15:22:28.077192Z"
}
},
"outputs": [
{
"ename": "UnboundLocalError",
"evalue": "local variable 'gender' referenced before assignment",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-36-024760dbbe6e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_distribution_plot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf_h_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-32-4ffd4843b734>\u001b[0m in \u001b[0;36mget_distribution_plot\u001b[0;34m(df, col, title)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mgenders\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_unique_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'gender'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mcolors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"rbgcmykw\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgender\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgender\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0max\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdistplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'gender'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_group\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhist\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrug\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcolors\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlabel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgender\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'gender' referenced before assignment"
]
}
],
"source": [
"get_distribution_plot(df_h_index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.095405Z",
"start_time": "2018-07-10T15:22:27.985Z"
}
},
"outputs": [],
"source": [
"get_distribution_plot(df_h_index_per_year)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-07-10T15:22:28.097529Z",
"start_time": "2018-07-10T15:22:27.990Z"
}
},
"outputs": [],
"source": [
"get_distribution_plot(df_h_index_per_year_per_fields)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
......@@ -2,12 +2,21 @@
# -*- coding: utf-8 -*-
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from functools import reduce
def get_unique_fields(df, col='conference_fields'):
    """Return the distinct values found in column ``col`` of ``df``.

    A cell may hold either a single value or a list of values; list cells are
    flattened so that every element is reported individually.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col : str
        Name of the column whose values are collected.

    Returns
    -------
    list
        Unique values in first-seen order (empty when ``df`` has no rows).
        Values must be hashable.
    """
    # A dict is used as an ordered set: deterministic output order, and a
    # single pass instead of repeatedly concatenating lists.
    seen = {}
    for cell in df[col]:
        # Every row goes through the same flattening, so a one-row frame whose
        # cell is a list yields its elements rather than a nested list.
        items = cell if isinstance(cell, list) else [cell]
        for item in items:
            seen.setdefault(item, None)
    return list(seen)
def cretae_gender_field_df(df):
def create_gender_field_df(df):
genders = df.gender.unique()
fields = get_unique_fields(df)
......@@ -18,14 +27,55 @@ def cretae_gender_field_df(df):
return df_gender_field
def group_by_fields(df, fields=None):
    """Split ``df`` into one sub-frame per conference field.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``conference_fields`` column whose cells are either a
        list of fields or a single value.
    fields : iterable, optional
        Fields to build groups for. When ``None`` they are taken from
        ``get_unique_fields(df)``.

    Returns
    -------
    dict
        ``{field: rows of df whose conference_fields contain field}``. A row
        may appear under several fields.
    """
    if fields is None:
        fields = get_unique_fields(df)

    # Wrap scalar cells in a list so membership is tested by equality: no
    # substring matching on strings, no TypeError on None/NaN cells.
    per_row = df.conference_fields.apply(
        lambda cell: cell if isinstance(cell, list) else [cell]
    )

    def _rows_with(field):
        # astype(bool) keeps the mask boolean even when the frame is empty.
        mask = per_row.apply(lambda items: field in items).astype(bool)
        return df[mask]

    return {field: _rows_with(field) for field in fields}
def group_by_years(df, years=None):
    """Split ``df`` into one sub-frame per value of its ``date`` column.

    ``years`` lists the values to group on; when it is ``None`` the values
    are obtained from ``get_unique_fields(df, 'date')``. Returns a dict that
    maps each requested year to the rows of ``df`` whose ``date`` equals it.
    """
    if years is None:
        years = get_unique_fields(df, 'date')

    by_year = {}
    for year in years:
        same_year = df.date == year
        by_year[year] = df[same_year]
    return by_year
def group_by_years_fields(df, years=None, fields=None):
    """Group ``df`` first by year, then by conference field.

    Missing ``fields`` / ``years`` are looked up on the whole of ``df`` via
    ``get_unique_fields``. The result is a nested dict
    ``{year: {field: sub-frame}}`` built by applying ``group_by_fields`` to
    each per-year frame produced by ``group_by_years``.
    """
    if fields is None:
        fields = get_unique_fields(df)
    if years is None:
        years = get_unique_fields(df, 'date')

    nested = {}
    for year, year_frame in group_by_years(df, years).items():
        nested[year] = group_by_fields(year_frame, fields)
    return nested
def get_distribution_plot(df, col='h_index', title='all'):
    """Plot the distribution of ``col`` per gender, one figure per frame.

    Parameters
    ----------
    df : pandas.DataFrame or dict
        Either a frame with a ``gender`` column and column ``col``, or an
        arbitrarily nested dict of such frames (as returned by the
        ``group_by_*`` helpers). Dicts are walked recursively and each key is
        appended to the figure title.
    col : str
        Column whose distribution is drawn.
    title : str
        Figure title; the default ``'all'`` is not used as a prefix for
        nested titles.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()``. Frames with no rows, or
        with no non-null gender group, are skipped.
    """
    if isinstance(df, dict):
        new_title = '' if title == 'all' else f"{title} /"
        for key, sub_frame in df.items():
            # Forward ``col`` so nested frames plot the requested column
            # instead of silently falling back to the default.
            get_distribution_plot(sub_frame, col=col, title=f"{new_title} {key}")
        return

    if df.shape[0] == 0:
        return

    colors = "rbgcmykw"
    patches = []
    ax = None
    # Group once; groupby yields groups in sorted key order, so a given gender
    # gets the same colour in every figure.
    for i, (gender, group) in enumerate(df.groupby('gender')):
        color = colors[i % len(colors)]  # wrap around instead of IndexError
        ax = sns.distplot(group[col], hist=True, rug=True, color=color, label=gender)
        patches.append(mpatches.Patch(color=color, label=gender))

    if ax is None:
        # No non-null gender values: nothing was drawn.
        return

    ax.set_title(title)
    ax.legend(handles=patches)
    plt.show()
# TODO: make histogram
# --- Gender share per conference field (bar chart) -------------------------
# NOTE(review): this active block reads o-5.json and calls the
# `cretae_gender_field_df` spelling, while an otherwise identical block below
# reads o-7.json, uses `create_gender_field_df`, and is commented out. This
# looks like the old and new sides of a diff rendered together — confirm
# which one is current before running.
df = pd.read_json('../scrapers/o-5.json')
df_gender_field = cretae_gender_field_df(df)
# Scale every column so that its values sum to 100 (percentages).
df_gender_field_normalized = df_gender_field.apply(lambda col: col / col.sum() * 100 )
# Transpose both tables and order the rows by their 'female' column.
df_gender_field = df_gender_field.transpose().sort_values(by='female', ascending=False)
df_gender_field_normalized = df_gender_field_normalized.transpose().sort_values(by='female', ascending=True)
ax = df_gender_field_normalized.plot.barh(figsize=(12,16), grid=True)
ax.set(xlabel='Ratio', ylabel='Fields')
# --- h-index distributions --------------------------------------------------
# `df` is rebound here; everything below works on the o-7.json data.
df = pd.read_json('../scrapers/o-7.json')
#df_gender_field = create_gender_field_df(df)
#df_gender_field_normalized = df_gender_field.apply(lambda col: col / col.sum() * 100 )
#df_gender_field = df_gender_field.transpose().sort_values(by='female', ascending=False)
#df_gender_field_normalized = df_gender_field_normalized.transpose().sort_values(by='female', ascending=True)
#ax = df_gender_field_normalized.plot.barh(figsize=(12,16), grid=True)
#ax.set(xlabel='Ratio', ylabel='Fields')
#df_ = pd.read_json('../scrapers/top-computer-science-conferences.json')
#df_ = pd.read_json('conferences.json')
# Keep only the rows that have an h_index value.
df_h_index = df[df.h_index.notnull()]
# Fields and years are collected from the unfiltered frame, so the nested
# grouping below can contain keys with no h_index rows (empty frames).
fields = get_unique_fields(df)
years = get_unique_fields(df, 'date')
df_h_index_per_fields = group_by_fields(df_h_index)
df_h_index_per_year = group_by_years(df_h_index)
df_h_index_per_year_per_fields = group_by_years_fields(df_h_index, years, fields) # { y: { f: df_h_index_per_year[y][df_h_index_per_year[y].conference_fields.apply(lambda x: f in x)] for f in fields } for y in df_h_index_per_year.keys() }
# Only the per-year plots are currently enabled.
#get_distribution_plot(df_h_index)
get_distribution_plot(df_h_index_per_year)
#get_distribution_plot(df_h_index_per_year_per_fields)
#a = get_unique_fields(df_h_index_per_year_per_fields[2017]['human-computer-interaction'], 'gender')
\ No newline at end of file
This diff is collapsed.
......@@ -2,6 +2,7 @@
import scrapy
import datetime
import json
import re
from css.items import AuthorItem
from scrapy.http.request import Request
from difflib import SequenceMatcher
......@@ -43,7 +44,7 @@ class HIndexSpider(scrapy.Spider):
item['conference_fields'] = conference['fields']
item['gender'] = speaker['gender']
item['organization'] = speaker['organization']
item['date'] = now.strftime("%Y-%m-%d")
item['date'] = re.search(r'(20\d{2})', conference['date']).group() #now.strftime("%Y-%m-%d")
item['name'] = speaker['name']
item['citations'] = None
item['citations_last_5_year'] = None
......@@ -66,7 +67,7 @@ class HIndexSpider(scrapy.Spider):
item['conference_fields'] = conference['fields']
item['gender'] = speaker['gender']
item['organization'] = speaker['organization']
item['date'] = now.strftime("%Y-%m-%d")
item['date'] = re.search(r'(20\d{2})', conference['date']).group() # now.strftime("%Y-%m-%d")
item['name'] = name
item['citations'] = citation_table[0]
item['citations_last_5_year'] = citation_table[1]
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment