Commit 104d2a5a authored by Orkut Karacalik
Browse files

fixes

parent a3b19eab
.directory
scrapers/*.json
*.pyc
*.ini
*.json
*.txt
*.bat
*.sh
......
......@@ -11,4 +11,6 @@ df = pd.read_json('./conferences.json')
df['nums'] = df['speakers'].apply(lambda x: len(x) if cmp(x, [{'name': '', 'gender': '', 'organization': ''}]) else 0)
total_conferences = df[df.nums > 0].shape[0]
total_speakers = df.nums.sum()
\ No newline at end of file
total_speakers = df.nums.sum()
speakers = pd.read_json('../scrapers/o-4.json')
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -15,7 +15,7 @@ class HIndexSpider(scrapy.Spider):
]
def start_requests(self):
url = lambda name: f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split())}"
url = lambda name: f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split(' '))}"
with open('../data/conferences.json') as f:
data = json.load(f)
for conference in data:
......@@ -24,12 +24,16 @@ class HIndexSpider(scrapy.Spider):
yield Request(url(speaker['name']), self.parse, meta=dict(conference=conference, speaker=speaker))
def parse(self, response):
    """Follow the search-result profile that best matches the queried speaker.

    Reads the text of every ``span.gs_hlt`` element and the ``href`` of every
    ``h3.gsc_oai_name>a`` element from ``response``. When there is exactly one
    extracted text per link, each text is scored against the speaker's name
    with ``self.__get_similarity__`` and the link at the first highest score
    is followed. Otherwise, a lone link is followed as-is. In any other case
    nothing is yielded.

    The follow-up request is handled by ``self.parse_profile`` and carries
    the ``conference`` and ``speaker`` entries of ``response.meta`` along.
    """
    # Debug output kept from the original implementation.
    print(response.url)

    names = response.css("span.gs_hlt::text").extract()
    profile_links = response.css("h3.gsc_oai_name>a::attr(href)").extract()

    speaker = response.meta['speaker']
    print(speaker['name'], profile_links)

    if names and len(names) == len(profile_links):
        # One score per candidate, in the same order as profile_links.
        scores = [
            self.__get_similarity__(speaker['name'], candidate)
            for candidate in names
        ]
        chosen_link = profile_links[scores.index(max(scores))]
    elif len(profile_links) == 1:
        chosen_link = profile_links[0]
    else:
        # No usable candidate on this page.
        return

    yield Request(
        response.urljoin(chosen_link),
        self.parse_profile,
        meta=dict(conference=response.meta['conference'], speaker=speaker),
    )
def parse_profile(self, response):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment