Commit 1f204a82 authored by Orkut Karaçalık's avatar Orkut Karaçalık

pipelines and main notebook update

parent 032f1641
@@ -4,45 +4,66 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from __future__ import absolute_import
from scrapy.exporters import JsonLinesItemExporter
from css.items import ComputerScienceConference
from css.items import AuthorItem
from scrapers.css.items import ComputerScienceConference
from scrapers.css.items import AuthorItem
import calendar
import time
import os
root = f'{os.path.dirname(os.path.realpath(__file__))}/../../..'
temp_file = lambda x: f'{root}/temp/{x}-{calendar.timegm(time.gmtime())}.json'
data_file = lambda x: f'{root}/data/{x}-{calendar.timegm(time.gmtime())}.json'
class ComputerScienceConferencePipeline(object):
def open_spider(self, spider):
if spider.name == 'computer-science-conferences':
self.file = open(f"top-{calendar.timegm(time.gmtime())}.json", "wb+")
else:
self.file = open(f"top-parsed-{calendar.timegm(time.gmtime())}.json", "wb+")
self.file.write(b'[')
self.file = open(temp_file('top-computer-science-conferences'), "wb+")
self.file.write(b'[')
elif spider.name == 'conference-website':
self.file = open(temp_file('conferences'), "wb+")
self.file.write(b'[')
def close_spider(self, spider):
self.file.seek(-1, os.SEEK_END)
self.file.truncate()
self.file.write(b']')
self.exporter.finish_exporting()
self.exporter.file.close()
self.file.close()
if spider.name in ['computer-science-conferences', 'conference-website']:
self.file.seek(-1, os.SEEK_END)
self.file.truncate()
self.file.write(b']')
self.exporter.finish_exporting()
self.exporter.file.close()
self.file.close()
def process_item(self, item, spider):
if not isinstance(item, ComputerScienceConference):
return item
self.exporter = JsonLinesItemExporter(self.file)
self.exporter.start_exporting()
self.exporter.export_item(item)
self.file.write(b',')
if spider.name in ['computer-science-conferences', 'conference-website']:
self.exporter = JsonLinesItemExporter(self.file)
self.exporter.start_exporting()
self.exporter.export_item(item)
self.file.write(b',')
return item
class AuthorItemPipeline(object):
def open_spider(self, spider):
if spider.name == 'h-index':
self.file = open(temp_file('h-index'), "wb+")
self.file.write(b'[')
def close_spider(self, spider):
if spider.name == 'h-index':
self.file.seek(-1, os.SEEK_END)
self.file.truncate()
self.file.write(b']')
self.exporter.finish_exporting()
self.exporter.file.close()
self.file.close()
def process_item(self, item, spider):
if not isinstance(item, AuthorItem):
return item
# Export AuthorItem results; only the h-index spider yields them
if spider.name == 'h-index':
self.exporter = JsonLinesItemExporter(self.file)
self.exporter.start_exporting()
self.exporter.export_item(item)
self.file.write(b',')
return item
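Both pipelines build a JSON array by hand from JsonLinesItemExporter output: open_spider writes an opening bracket, process_item appends a comma after every exported object, and close_spider seeks back one byte to drop the trailing comma before writing the closing bracket. A minimal standalone sketch of that pattern (the path and items below are illustrative only):
import json
import os

def write_json_array(path, items):
    # Same approach as the pipelines: '[', comma-separated objects, ']'.
    with open(path, "wb+") as f:
        f.write(b'[')
        wrote_any = False
        for item in items:
            f.write(json.dumps(item).encode("utf-8"))
            f.write(b',')  # trailing comma, trimmed below
            wrote_any = True
        if wrote_any:
            # Step back one byte and cut the last comma, as close_spider does.
            f.seek(-1, os.SEEK_END)
            f.truncate()
        f.write(b']')

write_json_array('/tmp/conferences-example.json', [{'name': 'ICSE'}, {'name': 'FSE'}])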
@@ -66,6 +66,7 @@ CONCURRENT_REQUESTS = 32
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'css.pipelines.ComputerScienceConferencePipeline': 100,
'css.pipelines.AuthorItemPipeline': 200,
}
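Both entries receive every item; the numbers only fix the order (lower runs first), so ComputerScienceConferencePipeline sees each item before AuthorItemPipeline, and each pipeline filters on item type and spider name itself.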
# Enable and configure the AutoThrottle extension (disabled by default)
......
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from css.items import ComputerScienceConference
from scrapers.css.items import ComputerScienceConference
from w3lib.html import remove_tags, strip_html5_whitespace, replace_escape_chars
class ComputerScienceConferencesSpider(scrapy.Spider):
......
# -*- coding: utf-8 -*-
import json
import scrapy
import os
import json
from scrapy.http.request import Request
from scrapy.http import HtmlResponse
@@ -8,16 +9,11 @@ class ConferenceWebsiteSpider(scrapy.Spider):
name = 'conference-website'
def start_requests(self):
with open('top-computer-science-conferences.json') as f:
with open(f'{os.path.dirname(os.path.realpath(__file__))}/../../../../data/top-computer-science-conferences.json') as f:
data = json.load(f)
for conference in data:
yield Request(conference['website'], self.parse, meta=dict(conference=conference))
# 1. it could be on the current page
# 1.1. it could be within the picked elements
# 2. it could be in a linked page/fragment
# 2.1 check hrefs w/ regex
# 2.2 the text could be in descendant elements
def parse(self, response):
result = []
texts = response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]/ancestor::*[@href]').extract()
@@ -29,7 +25,7 @@ class ConferenceWebsiteSpider(scrapy.Spider):
conference = response.meta['conference']
conference['hasKeywords'] = result
conference['speaker_urls'] = list(set(self.__join_urls__(self.__extract_urls__(result), response)))
conference['home'] = response.text
# conference['home'] = response.text
yield conference
def __extract_urls__(self, items):
......
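The keynote detection in parse relies on Scrapy's EXSLT support: the re: prefix in that XPath maps to the EXSLT regular-expressions namespace, which Scrapy selectors register by default, so expressions like re:match or re:test can filter nodes by pattern. A small self-contained sketch of the same idea (the HTML snippet is invented):
from scrapy.http import HtmlResponse

html = b'<html><body><a href="/speakers.html">Keynote speakers</a><a href="/venue.html">Venue</a></body></html>'
response = HtmlResponse(url="http://example.org/", body=html)
# "i" makes the pattern case-insensitive, matching the spider's usage.
links = response.xpath('//a[re:test(text(), "speaker|invited|keynote", "i")]/@href').extract()
print(links)  # ['/speakers.html']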
@@ -2,7 +2,7 @@
import scrapy
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
from css.items import ConferenceItem
from scrapers.css.items import ConferenceItem
from copy import deepcopy
class CsConferencesSpider(scrapy.Spider):
@@ -33,7 +33,7 @@ class CsConferencesSpider(scrapy.Spider):
conferenceItem['name'] = htmlResponse.xpath("//a/text()").extract_first()
conferenceItem['link'] = response.urljoin(htmlResponse.xpath("//a/@href").extract_first())
yield Request(conferenceItem['link'], meta=dict(conferenceItem=conferenceItem), callback=self.parse_conferences)
def parse_conferences(self, response):
conferenceItem = response.meta['conferenceItem']
links = response.xpath("//div[@id='bodyContent']//a[starts-with(@href, 'http') and not(contains(@href, 'wikipedia')) ]/@href").extract()
@@ -43,12 +43,6 @@ class CsConferencesSpider(scrapy.Spider):
conferenceItem['speaker_pages'] = []
conferenceItem['speakers'] = []
yield conferenceItem
def find_pages(self, response):
return
def parse_speakers(self, response):
return
def __convert_html__(self, item):
return HtmlResponse(url="https://en.wikipedia.org/wiki/List_of_computer_science_conferences", body=item, encoding="utf-8")
......
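__convert_html__ above illustrates a useful pattern: wrapping a raw HTML fragment in an HtmlResponse so the familiar XPath/CSS selectors (and urljoin) can be reused outside a download callback. A minimal illustration with an invented fragment:
from scrapy.http import HtmlResponse

fragment = '<li><a href="/wiki/ICSE">ICSE</a> - software engineering</li>'
wrapped = HtmlResponse(
    url="https://en.wikipedia.org/wiki/List_of_computer_science_conferences",
    body=fragment,
    encoding="utf-8",  # needed when body is a str rather than bytes
)
print(wrapped.xpath("//a/text()").extract_first())  # 'ICSE'
print(wrapped.urljoin(wrapped.xpath("//a/@href").extract_first()))  # absolute wiki URL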
@@ -4,8 +4,8 @@ import datetime
import json
import re
import unicodedata
from css.items import AuthorItem
import os
from scrapers.css.items import AuthorItem
from scrapy.http.request import Request
from difflib import SequenceMatcher
from scrapy.http import HtmlResponse
@@ -17,6 +17,7 @@ from w3lib.html import remove_entities
class HIndexSpider(scrapy.Spider):
name = 'h-index'
custom_settings = {"ROBOTSTXT_OBEY": False}
allowed_domains = ['scholar.google.com']
start_urls = [
'https://scholar.google.com/citations?user=QvpcUn8AAAAJ&hl=en'
@@ -25,11 +26,12 @@
def start_requests(self):
def url(
name): return f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split(' '))}"
with open('../../data/conferences.json') as f:
with open(f'{os.path.dirname(os.path.realpath(__file__))}/../../../../data/conferences.json') as f:
data = json.load(f)
# with open('./sample-speakers.json') as f:
# data = json.load(f)
for conference in data:
for index, conference in enumerate(data):
# if index < 10: break
for speaker in conference['speakers']:
if len(speaker['name']) > 0:
# yield Request(url(speaker['name']), self.parse, meta=dict(conference=conference, speaker=speaker))
@@ -74,7 +76,7 @@ class HIndexSpider(scrapy.Spider):
def parse_profile(self, response):
name = response.css("#gsc_prf_in::text").extract_first()
citation_table = response.css(".gsc_rsb_std::text").extract()
print(name, citation_table)
# print(name, citation_table)
if len(citation_table) > 0:
yield self.__yield_profile__(
response.meta['conference'], response.meta['speaker'], citation_table, response.url)
......
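The url helper in start_requests just plugs a plus-joined author name into Google Scholar's author-search endpoint; for example (the name is illustrative):
def url(name):
    return f"https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors={'+'.join(name.split(' '))}"

print(url("Grace Hopper"))
# https://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=Grace+Hopper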
@@ -6,10 +6,7 @@ from w3lib.html import remove_tags, strip_html5_whitespace, replace_escape_chars
import json
import spacy
import re
# DONE: load json file
# TODO: clean html response
# TODO: if not link found, search through whole pages (links should be cleaned)
# TODO: activate ner
class SpeakersSpider(scrapy.Spider):
name = 'speakers'
start_urls = ['http://www.ase2014.org/',
@@ -19,7 +16,7 @@ class SpeakersSpider(scrapy.Spider):
def start_requests(self):
with open('conference-links-2.json') as f:
data = json.load(f)
for index, conference in enumerate(data):
for conference in data:
yield Request(conference['website'], self.parse, meta=dict(conference=conference))
def parse(self, response):
@@ -30,7 +27,7 @@ class SpeakersSpider(scrapy.Spider):
response.meta['conference']['speaker_urls'].append(nextPath)
yield Request(nextPath, meta=dict(conference=response.meta['conference']), callback=self.parse_names)
# otherwise look for all links
def parse_names(self, response):
text = self.__remove_whitespace__(remove_tags(
replace_escape_chars(strip_html5_whitespace(response.text))))
@@ -44,7 +41,7 @@ class SpeakersSpider(scrapy.Spider):
def __remove_whitespace__(self, text):
return re.sub(' +|&nbsp|&nbsp;', ' ', text)
def __find_names__(self, text):
doc = self.nlp(text)
entries_person = filter(lambda x: x.label_ == 'PER', doc.ents)
......
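__find_names__ keeps spaCy entities labelled 'PER', which is the person label used by the multilingual models (e.g. xx_ent_wiki_sm); English models emit 'PERSON' instead, so the label to check depends on which model self.nlp was loaded from. A hedged sketch of that filtering step, assuming xx_ent_wiki_sm is installed:
import spacy

nlp = spacy.load("xx_ent_wiki_sm")  # assumption: a model whose person label is 'PER'
doc = nlp("Keynote by Barbara Liskov and Leslie Lamport.")
names = [ent.text for ent in doc.ents if ent.label_ == "PER"]
print(names)  # e.g. ['Barbara Liskov', 'Leslie Lamport']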