Commit 032f1641 authored by Orkut Karaçalık's avatar Orkut Karaçalık
Browse files

computer-science-conferences pipeline

parent 284af4c3
......@@ -4,42 +4,45 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from functools import reduce
import re
class ConferenceItemPipeline(object):
    """Classify conference website links so unwanted ones can be dropped.

    Each ``is*`` predicate returns True when *link* falls into one
    unwanted category (publisher portals, social networks, raw document
    files, bare IP addresses, ...). ``__filter_links__`` ORs all of the
    predicates together.
    """

    # --- per-category link predicates (simple substring checks) ---
    def isPdf(self, link): return '.pdf' in link
    def isToc(self, link): return link.endswith('.toc') or link.endswith('.sg')
    def isUniTrier(self, link): return 'uni-trier.de' in link
    def isAcm(self, link): return 'acm.org' in link
    def isAcmPortal(self, link): return 'portal.acm' in link
    def isBlogspot(self, link): return 'blogspot' in link
    def isPs(self, link): return '.ps' in link
    def isSourceforge(self, link): return 'sourceforge' in link
    def isIeeExplore(self, link): return 'ieeexplore.ieee.org' in link
    def isIsni(self, link): return 'isni.org' in link
    def isOxfordJournal(self, link): return 'oxfordjournals' in link
    def isCambridgeJournal(self, link): return 'journals.cambridge' in link
    def isScienceDirect(self, link): return 'sciencedirect.com' in link
    def isSpringerLink(self, link): return 'springerlink' in link or 'link.springer' in link or 'springer.com' in link
    def isWikimedia(self, link): return 'wikimedia' in link
    def isDoi(self, link): return 'doi.org' in link
    def isGoogle(self, link): return 'google.com' in link or 'google.de' in link or 'google.it' in link
    def isTwitter(self, link): return 'twitter.com' in link
    def isLinkedin(self, link): return 'linkedin.com' in link
    def isYoutube(self, link): return 'youtube' in link
    def isJstor(self, link): return 'jstor.org' in link
    def isWorldCat(self, link): return 'worldcat.org' in link
    def isMicrosoft(self, link): return 'research.microsoft' in link

    def isIp(self, link):
        """True when *link* contains a dotted-quad numeric address."""
        return bool(re.search(r'\d+\.\d+\.\d+\.\d+', link))

    def __filter_links__(self, link):
        """Return True if *link* matches any unwanted category.

        Replaces the original eager ``reduce(lambda x, y: x or y, ...)``
        over all 24 predicates with a short-circuiting ``any()``.
        """
        checks = (
            self.isPdf, self.isToc, self.isUniTrier, self.isAcm,
            self.isAcmPortal, self.isBlogspot, self.isPs,
            self.isSourceforge, self.isIeeExplore, self.isIsni,
            self.isOxfordJournal, self.isCambridgeJournal,
            self.isScienceDirect, self.isSpringerLink, self.isWikimedia,
            self.isDoi, self.isGoogle, self.isTwitter, self.isLinkedin,
            self.isYoutube, self.isJstor, self.isWorldCat,
            self.isMicrosoft, self.isIp,
        )
        return any(check(link) for check in checks)
from scrapy.exporters import JsonLinesItemExporter
from css.items import ComputerScienceConference
from css.items import AuthorItem
import calendar
import time
import os
class ComputerScienceConferencePipeline(object):
    """Export ComputerScienceConference items to a timestamped JSON file.

    Items are written as JSON lines separated by commas, and the file is
    wrapped in ``[`` ... ``]`` at open/close so the result parses as one
    JSON array.

    NOTE(review): process_item calls ``self.__filter_links__``, which is
    defined on ConferenceItemPipeline, not on this class. Conference
    items reaching this pipeline will raise AttributeError unless that
    method is shared/mixed in -- confirm intent.
    """

    def open_spider(self, spider):
        """Open the output file and set up the exporter once per crawl."""
        if spider.name == 'computer-science-conferences':
            self.file = open(f"top-{calendar.timegm(time.gmtime())}.json", "wb+")
        else:
            self.file = open(f"top-parsed-{calendar.timegm(time.gmtime())}.json", "wb+")
        self.file.write(b'[')
        # Create the exporter once here. The original re-created it (and
        # re-called start_exporting) for every item in process_item, and
        # close_spider crashed with AttributeError when no item was ever
        # exported.
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Strip the trailing comma, close the JSON array and the file."""
        self.exporter.finish_exporting()
        # Drop the ',' written after the last item -- but only when at
        # least one item was written; otherwise truncating would chop
        # the opening '['.
        if self.file.seek(0, os.SEEK_END) > 1:
            self.file.seek(-1, os.SEEK_END)
            self.file.truncate()
        self.file.write(b']')
        # The exporter writes to self.file, so one close() suffices (the
        # original closed the same handle twice).
        self.file.close()

    def process_item(self, item, spider):
        """Export conference items whose website link passes the filter.

        Non-conference items pass through unchanged; conference items
        with an unwanted website link are dropped.
        """
        if not self.__filter_links__(item['website']):
            if not isinstance(item, ComputerScienceConference):
                return item
        else:
            raise DropItem(f"Conference website link is not fine {item['website']}")
        self.exporter.export_item(item)
        self.file.write(b',')
        return item
class AuthorItemPipeline(object):
    """Pass-through pipeline stage for AuthorItem objects.

    Currently a placeholder: every item -- author or not -- is returned
    unchanged.
    """

    def process_item(self, item, spider):
        """Return *item* unchanged (author handling not implemented yet)."""
        if isinstance(item, AuthorItem):
            # TODO: real author-item processing goes here.
            return item
        return item
......@@ -64,9 +64,9 @@ CONCURRENT_REQUESTS = 32
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'css.pipelines.ConferenceItemPipeline': 100,
# }
ITEM_PIPELINES = {
'css.pipelines.ComputerScienceConferencePipeline': 100,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
......@@ -89,4 +89,4 @@ CONCURRENT_REQUESTS = 32
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RETRY_TIMES = 1
\ No newline at end of file
RETRY_TIMES = 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment