Commit 4b3d878b authored by Orkut Karacalik

improvements

- conference item updated
- conference pipeline added
- basic name extraction
parent 9e255674
.directory
*.pyc
*.ini
*.json
\ No newline at end of file
*.json
*.txt
\ No newline at end of file
@@ -3,12 +3,15 @@
## Methodology
1. Select field in CS
2. Search for external links
3. Use google to search in-site for keywords ['speaker', ...]
4. Get pages and look for keywords ['bio', 'biography', ...]
5. Save available data
- it can include conferences
3. Open webpage
- it can include conference links
- conference links w/ year
4. Look for links that include keywords (see the sketch below)
- link-text includes keywords
- link-href includes keywords
- it can be the same page
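A rough sketch of steps 3-4 (keyword-based link discovery), using scrapy's `Selector` on a made-up HTML snippet; the keyword list mirrors the one in the speakers spider, everything else is illustrative:

```python
from scrapy.selector import Selector

KEYWORDS = ['keynotes', 'speaker', 'invited', 'program']

# made-up conference page snippet
html = '''
<a href="/keynote-speakers.html">Keynote Speakers</a>
<a href="/venue.html">Venue</a>
'''

matching = []
for a in Selector(text=html).xpath('//a'):
    text = (a.xpath('text()').extract_first() or '').lower()
    href = (a.xpath('@href').extract_first() or '').lower()
    # keep the link if its text or its href mentions a keyword
    if any(k in text or k in href for k in KEYWORDS):
        matching.append(a.xpath('@href').extract_first())

print(matching)  # ['/keynote-speakers.html']
```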
### Python Packages
1. scrapy
2. spacy
3.
\ No newline at end of file
2. spacy
\ No newline at end of file
@@ -2,10 +2,47 @@
# -*- coding: utf-8 -*-
import json
import re
from functools import reduce
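# Ad-hoc analysis helper: load the scraped conference/speaker JSON dumps and
# drop external links that point at publishers, paper indexes, social media,
# or bare IP addresses rather than actual conference websites.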
with open('conferences.json') as f:
    conferences = json.load(f)
with open('conference-links.json') as f:
    conference_links = json.load(f)
links = reduce(lambda x, y: x + y, list(map(lambda x: x['conferences'], conferences)))
with open('speakers-method-1-1.json') as f:
    speakers_method_1 = json.load(f)
speakers = reduce(lambda x, y: x + y,
                  list(map(lambda x: x['speakers'], speakers_method_1)))
links = reduce(lambda x, y: x + y,
               list(map(lambda x: [x['website']], conference_links)))
links.sort()
isPdf = lambda link: '.pdf' in link
isToc = lambda link: link.endswith('.toc') or link.endswith('.sg')
isUniTrier = lambda link: 'uni-trier.de' in link
isAcm = lambda link: 'acm.org' in link
isAcmPortal = lambda link: 'portal.acm' in link
isBlogspot = lambda link: 'blogspot' in link
isPs = lambda link: '.ps' in link
isSourceforge = lambda link: 'sourceforge' in link
isIeeExplore = lambda link: 'ieeexplore.ieee.org' in link
isIsni = lambda link: 'isni.org' in link
isOxfordJournal = lambda link: 'oxfordjournals' in link
isCambridgeJournal = lambda link: 'journals.cambridge' in link
isScienceDirect = lambda link: 'sciencedirect.com' in link
isSpringerLink = lambda link: 'springerlink' in link or 'link.springer' in link or 'springer.com' in link
isWikimedia = lambda link: 'wikimedia' in link
isDoi = lambda link: 'doi.org' in link
isGoogle = lambda link: 'google.com' in link or 'google.de' in link or 'google.it' in link
isTwitter = lambda link: 'twitter.com' in link
isLinkedin = lambda link: 'linkedin.com' in link
isYoutube = lambda link: 'youtube' in link
isJstor = lambda link: 'jstor.org' in link
isWorldCat = lambda link: 'worldcat.org' in link
isMicrosoft = lambda link: 'research.microsoft' in link
isIp = lambda link: bool(re.search(r'\d+\.\d+\.\d+\.\d+', link))
def filter_links(link, filters):
    return reduce(lambda x, y: x or y, map(lambda x: x(link), filters))
links_filtered = list(filter(lambda link: not filter_links(link, [isPdf, isToc, isUniTrier, isAcm, isAcmPortal, isBlogspot, isPs, isSourceforge, isIeeExplore, isIsni, isOxfordJournal, isCambridgeJournal, isScienceDirect, isSpringerLink, isWikimedia, isDoi, isGoogle, isTwitter, isLinkedin, isYoutube, isJstor, isWorldCat, isMicrosoft, isIp]), links))
@@ -12,10 +12,7 @@ class ConferenceItem(Item):
    subfield = Field()
    name = Field()
    link = Field()
    conferences = Field()
class ConferenceWebsite(Item):
    link = Field()
    website = Field()
    speakers = Field()
class AuthorProfileItem(Item):
@@ -4,8 +4,42 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from functools import reduce
import re
class ConferenceItemPipeline(object):
    isPdf = lambda self, link: '.pdf' in link
    isToc = lambda self, link: link.endswith('.toc') or link.endswith('.sg')
    isUniTrier = lambda self, link: 'uni-trier.de' in link
    isAcm = lambda self, link: 'acm.org' in link
    isAcmPortal = lambda self, link: 'portal.acm' in link
    isBlogspot = lambda self, link: 'blogspot' in link
    isPs = lambda self, link: '.ps' in link
    isSourceforge = lambda self, link: 'sourceforge' in link
    isIeeExplore = lambda self, link: 'ieeexplore.ieee.org' in link
    isIsni = lambda self, link: 'isni.org' in link
    isOxfordJournal = lambda self, link: 'oxfordjournals' in link
    isCambridgeJournal = lambda self, link: 'journals.cambridge' in link
    isScienceDirect = lambda self, link: 'sciencedirect.com' in link
    isSpringerLink = lambda self, link: 'springerlink' in link or 'link.springer' in link or 'springer.com' in link
    isWikimedia = lambda self, link: 'wikimedia' in link
    isDoi = lambda self, link: 'doi.org' in link
    isGoogle = lambda self, link: 'google.com' in link or 'google.de' in link or 'google.it' in link
    isTwitter = lambda self, link: 'twitter.com' in link
    isLinkedin = lambda self, link: 'linkedin.com' in link
    isYoutube = lambda self, link: 'youtube' in link
    isJstor = lambda self, link: 'jstor.org' in link
    isWorldCat = lambda self, link: 'worldcat.org' in link
    isMicrosoft = lambda self, link: 'research.microsoft' in link
    isIp = lambda self, link: bool(re.search(r'\d+\.\d+\.\d+\.\d+', link))
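    # True when the link matches any of the ignore-list predicates above.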
    def __filter_links__(self, link):
        return reduce(lambda x, y: x or y, list(map(lambda x: x(link), [self.isPdf, self.isToc, self.isUniTrier, self.isAcm, self.isAcmPortal, self.isBlogspot, self.isPs, self.isSourceforge, self.isIeeExplore, self.isIsni, self.isOxfordJournal, self.isCambridgeJournal, self.isScienceDirect, self.isSpringerLink, self.isWikimedia, self.isDoi, self.isGoogle, self.isTwitter, self.isLinkedin, self.isYoutube, self.isJstor, self.isWorldCat, self.isMicrosoft, self.isIp])))
class CssPipeline(object):
    def process_item(self, item, spider):
        return item
        if not self.__filter_links__(item['website']):
            return item
        else:
            raise DropItem(f"Conference website link filtered out: {item['website']}")
@@ -22,7 +22,7 @@ NEWSPIDER_MODULE = 'css.spiders'
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
@@ -39,34 +39,34 @@ ROBOTSTXT_OBEY = False
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'css.middlewares.CssSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'css.middlewares.CssDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'css.pipelines.CssPipeline': 300,
#}
ITEM_PIPELINES = {
'css.pipelines.ConferenceItemPipeline': 100,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
@@ -2,7 +2,8 @@
import scrapy
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
from css.items import ConferenceItem, ConferenceWebsite
from css.items import ConferenceItem
from copy import deepcopy
class CsConferencesSpider(scrapy.Spider):
    name = 'cs-conferences'
@@ -11,6 +12,8 @@ class CsConferencesSpider(scrapy.Spider):
        'https://en.wikipedia.org/wiki/List_of_computer_science_conferences'
    ]
    def parse(self, response):
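        # Section headlines (field / subfield) plus the conference <a> entries, in document order.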
        items = response.xpath("// h2[span[contains(@class, 'mw-headline')]] | //h3[span[contains(@class, 'mw-headline')]] | //*[*/ul/li/a][@class != 'toc']/ul/li/a").extract()
        currentField = "-"
@@ -26,18 +29,20 @@ class CsConferencesSpider(scrapy.Spider):
            if currentField == 'See also':
                break
            else:
                confItem = ConferenceItem()
                confItem['field'] = currentField
                confItem['subfield'] = currentSubfield
                confItem['name'] = htmlResponse.xpath("//a/text()").extract_first()
                confItem['link'] = response.urljoin(htmlResponse.xpath("//a/@href").extract_first())
                yield Request(confItem['link'], meta={'confItem': confItem}, callback=self.parse_conferences)
                conferenceItem = ConferenceItem()
                conferenceItem['field'] = currentField
                conferenceItem['subfield'] = currentSubfield
                conferenceItem['name'] = htmlResponse.xpath("//a/text()").extract_first()
                conferenceItem['link'] = response.urljoin(htmlResponse.xpath("//a/@href").extract_first())
                yield Request(conferenceItem['link'], meta=dict(conferenceItem=conferenceItem), callback=self.parse_conferences)
    def parse_conferences(self, response):
        confItem = response.meta['confItem']
        conferenceItem = response.meta['conferenceItem']
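        # Every external (non-wikipedia) link in the article body is a candidate conference website.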
        links = response.xpath("//div[@id='bodyContent']//a[starts-with(@href, 'http') and not(contains(@href, 'wikipedia')) ]/@href").extract()
        confItem['conferences'] = [ ConferenceWebsite(link=l, speakers=[]) for l in links]
        yield confItem
        for link in links:
            conferenceItem['website'] = link
            conferenceItem['speakers'] = []
            yield conferenceItem
    def find_pages(self, response):
        return
@@ -2,31 +2,42 @@
import scrapy
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
# TODO: load json file
import json
# DONE: load json file
# TODO: clean html response
# TODO: if not link found, search through whole pages (links should be cleaned)
# TODO: activate ner
class SpeakersSpider(scrapy.Spider):
    name = 'speakers'
    start_urls = ['http://www.ase2014.org/']
    start_urls = ['http://www.ase2014.org/',
                  'https://en.wikipedia.org/wiki/Federated_Computing_Research_Conference']
    def start_requests(self):
        with open('conference-links-1.json') as f:
            data = json.load(f)
        for index, conference in enumerate(data):
            yield Request(conference['website'], self.parse, meta=dict(conference=conference))
    def parse(self, response):
        keywords = ['keynotes', 'speaker', 'invited', 'program']
        # print(f"{response.meta['fieldIndex']} - {response.meta['conferenceIndex']}")
        # if response.meta['fieldIndex'] == 0 and response.meta['conferenceIndex'] == 0:
        #     response.meta['data'][0]['conferences'][0]['speakers'] = ["ben x"]
        #     # print(f"{response.meta['data'][0]['conferences'][0]['speakers']}")
        #     yield response.meta['data'][0]
        # keywords = ['keynotes', 'speaker', 'invited', 'program']
        # via keywords
        links = response.xpath('//a[descendant::*[re:match(text(), "keynotes|speaker|invited|program", "i")]]/@href').extract()
        for link in links:
            nextPath = link if link.startswith('http') else response.urljoin(link)
            yield Request(nextPath, meta={'link': nextPath}, callback=self.parse_names)
            yield Request(nextPath, meta=dict(conference=response.meta['conference']), callback=self.parse_names)
        # otherwise look for all links
    def parse_names(self, response):
        link = response.meta['link']
        speakers = response.xpath('//*[re:match(text(), "^prof|^dr", "i")]/text()').extract()
        yield {
            'link': link,
            'speakers': speakers
        }
        speakers = response.xpath('//*[re:match(text(), "^prof\.|^dr\.", "i")]/text()').extract()
        conference = response.meta['conference']
        conference['speakers'] = speakers
        yield conference
        # text = remove_tags(replace_escape_chars(strip_html5_whitespace(response.text)))
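# Sketch of the "activate ner" TODO above; not wired into the spider by this
# commit. spacy is listed in the README; the model name 'en_core_web_sm' and
# the helper name are assumptions for illustration only.
def extract_speaker_names(text):
    import spacy
    nlp = spacy.load('en_core_web_sm')  # better cached once in real use
    doc = nlp(text)
    # keep the PERSON entities found in the page text
    return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']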