Commit 19d094d8 authored by Orkut Karacalik

detect keywords

parent 4b3d878b
......@@ -2,47 +2,50 @@
# -*- coding: utf-8 -*-
import json
import pandas as pd
import re
from functools import reduce
with open('conference-links.json') as f:
    conference_links = json.load(f)
with open('speakers-method-1-1.json') as f:
    speakers_method_1 = json.load(f)
speakers = reduce(lambda x, y: x + y,
                  list(map(lambda x: x['speakers'], speakers_method_1)))
links = reduce(lambda x, y: x + y,
               list(map(lambda x: [x['website']], conference_links)))
links.sort()
isPdf = lambda link: '.pdf' in link
isToc = lambda link: link.endswith('.toc') or link.endswith('.sg')
isUniTrier = lambda link: 'uni-trier.de' in link
isAcm = lambda link: 'acm.org' in link
isAcmPortal = lambda link: 'portal.acm' in link
isBlogspot = lambda link: 'blogspot' in link
isPs = lambda link: '.ps' in link
isSourceforge = lambda link: 'sourceforge' in link
isIeeExplore = lambda link: 'ieeexplore.ieee.org' in link
isIsni = lambda link: 'isni.org' in link
isOxfordJournal = lambda link: 'oxfordjournals' in link
isCambridgeJournal = lambda link: 'journals.cambridge' in link
isScienceDirect = lambda link: 'sciencedirect.com' in link
isSpringerLink = lambda link: 'springerlink' in link or 'link.springer' in link or 'springer.com' in link
isWikimedia = lambda link: 'wikimedia' in link
isDoi = lambda link: 'doi.org' in link
isGoogle = lambda link: 'google.com' in link or 'google.de' in link or 'google.it' in link
isTwitter = lambda link: 'twitter.com' in link
isLinkedin = lambda link: 'linkedin.com' in link
isYoutube = lambda link: 'youtube' in link
isJstor = lambda link: 'jstor.org' in link
isWorldCat = lambda link: 'worldcat.org' in link
isMicrosoft = lambda link: 'research.microsoft' in link
isIp = lambda link: bool(re.search(r'\d+\.\d+\.\d+\.\d+', link))
def filter_links(link, filters):
    return reduce(lambda x, y: x or y, map(lambda x: x(link), filters))
links_filtered = list(filter(lambda link: not filter_links(link, [isPdf, isToc, isUniTrier, isAcm, isAcmPortal, isBlogspot, isPs, isSourceforge, isIeeExplore, isIsni, isOxfordJournal, isCambridgeJournal, isScienceDirect, isSpringerLink, isWikimedia, isDoi, isGoogle, isTwitter, isLinkedin, isYoutube, isJstor, isWorldCat, isMicrosoft, isIp]), links))
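# Example: filter_links('http://dl.acm.org/paper.pdf', [isPdf, isAcm]) ORs the
# predicates over the link and returns True (both isPdf and isAcm match), so
# such a link is excluded from links_filtered above.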
def hasHref(items):
    return reduce(lambda x, y: x or y, map(lambda i: bool(re.search(r'href=', i)), items))
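# Example: hasHref(['<a href="x">talk</a>', 'plain text']) is True because at
# least one item contains an href= attribute.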
def getNoHrefCount(df):
    # shape[0] gives the number of rows without an href, rather than the full shape tuple
    return df[df.hasHref == False].shape[0]
#with open('../scrapers/top-computer-science-conferences-keywords-2.json') as f:
# conferences = json.load(f)
df_1 = pd.read_json('../scrapers/top-computer-science-conferences-keywords-1.json')
df_1['hasHref'] = df_1['hasKeywords'].apply(hasHref)
df_new = pd.read_json('../scrapers/top-computer-science-conferences-keywords-5.json')
df_new['hasHref'] = df_new['hasKeywords'].apply(hasHref)
diff_1 = df_1[~df_1.name.isin(df_new.name)]
diff_2 = df_new[~df_new.name.isin(df_1.name)]
#def remove_whitespace(text):
# return re.sub(' +|&nbsp| ', ' ', text)
#
#def get_persons(entries):
# entries_person = filter(lambda x: x.label_ == 'PER', entries)
# return list(map(lambda x: x.text, entries_person))
#
#flatten = lambda l: [item for sublist in l for item in sublist]
#with open('speakers-method-2-2.json') as f:
# speakers_method = json.load(f)
#speakers = reduce(lambda x, y: x + y,
# list(map(lambda x: x['speakers'], speakers_method)))
#speakers = list(map(lambda x: { "a": x['field'], "names": flatten(x['speakers']) }, speakers_method))
#links = reduce(lambda x, y: x + y,
# list(map(lambda x: [x['website']], conference_links)))
#links.sort()
#page = remove_whitespace(speakers_method_1[0]['speaker_pages'][0])
#nlp = spacy.load('xx')
#doc = nlp(page)
#
#for ent in doc.ents:
# if ent.label_== 'PER': print(type(ent.label_), ent.label_, ent.text, ent.start_char, ent.end_char)
......@@ -13,6 +13,8 @@ class ConferenceItem(Item):
    name = Field()
    link = Field()
    website = Field()
    speaker_urls = Field()
    speaker_pages = Field()
    speakers = Field()
class AuthorProfileItem(Item):
......@@ -22,3 +24,13 @@ class AuthorProfileItem(Item):
    h_index = Field()
    citations_last_5_year = Field()
    h_index_lat_5_year = Field()
class ComputerScienceConference(Item):
    name = Field()
    date = Field()
    fields = Field()
    website = Field()
    hasKeywords = Field()
    speaker_urls = Field()
    speaker_pages = Field()
    speakers = Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
from css.items import ComputerScienceConference
from w3lib.html import remove_tags, strip_html5_whitespace, replace_escape_chars
class ComputerScienceConferencesSpider(scrapy.Spider):
    name = 'computer-science-conferences'
    allowed_domains = ['guide2research.com']
    start_urls = ['http://www.guide2research.com/topconf/']
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, self.parse, headers={'Content-Type': 'application/json', 'charset': 'UTF-8'})
    def parse(self, response):
        for link in response.xpath("//h4/a/@href").extract():
            yield Request(response.urljoin(link), self.parse_conference_profile, headers={'Content-Type': 'application/json', 'charset': 'UTF-8'})
    def parse_conference_profile(self, response):
        computerScienceConference = ComputerScienceConference()
        computerScienceConference['name'] = response.xpath("//header/h1/text()").extract_first()
        computerScienceConference['date'] = strip_html5_whitespace(replace_escape_chars(response.xpath('//td[re:match(text(), "conference dates", "i")]/following-sibling::node()/text()').extract_first()))
        computerScienceConference['fields'] = list(map(lambda x: x.split("http://www.guide2research.com/topconf/")[1], response.xpath("//table//a[(contains(@href, 'guide2research.com/topconf/'))]/@href").extract()))
        computerScienceConference['website'] = response.xpath("//table//a[starts-with(@href, 'http') and not(contains(@href, 'guide2research'))]/@href").extract_first()
        computerScienceConference['hasKeywords'] = None
        computerScienceConference['speaker_urls'] = []
        computerScienceConference['speaker_pages'] = []
        computerScienceConference['speakers'] = []
        yield computerScienceConference
    def check_keywords(self, response):
        # return the elements whose text mentions speaker/invited/keynote
        return response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]').extract()
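    # Note: the "re:" prefix in these XPath expressions relies on the EXSLT
    # regular-expressions namespace, which Scrapy selectors register by default,
    # so re:match(text(), "speaker|invited|keynote", "i") matches elements whose
    # text contains any of the keywords, case-insensitively.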
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http.request import Request
import json
class ConferenceWebsiteSpider(scrapy.Spider):
    name = 'conference-website'
    def start_requests(self):
        with open('top-computer-science-conferences.json') as f:
            data = json.load(f)
            for conference in data:
                yield Request(conference['website'], self.parse, meta=dict(conference=conference))
    # 1. it could be in the current page
    # 1.1. it could be within the selected elements
    # 2. it could be in a linked page/fragment
    # 2.1. check hrefs w/ regex
    # 2.2. the text could be in descendant elements
    def parse(self, response):
        result = []
        texts = response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]/ancestor::*[@href]').extract()
        if len(texts) == 0:
            texts = response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]').extract()
        result.extend(texts)
        if len(result) > 0:
            conference = response.meta['conference']
            conference['hasKeywords'] = result  # unique based on href
            yield conference
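# Hypothetical invocation (the output file name here is illustrative):
#   scrapy crawl conference-website -o conferences-with-keywords.json
# writes each yielded conference, including its hasKeywords matches, to a
# JSON feed that the pandas analysis above can load with read_json.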
......@@ -12,8 +12,6 @@ class CsConferencesSpider(scrapy.Spider):
        'https://en.wikipedia.org/wiki/List_of_computer_science_conferences'
    ]
    def parse(self, response):
        items = response.xpath("// h2[span[contains(@class, 'mw-headline')]] | //h3[span[contains(@class, 'mw-headline')]] | //*[*/ul/li/a][@class != 'toc']/ul/li/a").extract()
        currentField = "-"
......@@ -41,6 +39,8 @@ class CsConferencesSpider(scrapy.Spider):
links = response.xpath("//div[@id='bodyContent']//a[starts-with(@href, 'http') and not(contains(@href, 'wikipedia')) ]/@href").extract()
for link in links:
conferenceItem['website'] = link
conferenceItem['speaker_urls'] = []
conferenceItem['speaker_pages'] = []
conferenceItem['speakers'] = []
yield conferenceItem
......
......@@ -2,7 +2,10 @@
import scrapy
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
from w3lib.html import remove_tags, strip_html5_whitespace, replace_escape_chars
import json
import spacy
import re
# DONE: load json file
# TODO: clean html response
# TODO: if not link found, search through whole pages (links should be cleaned)
......@@ -11,35 +14,41 @@ class SpeakersSpider(scrapy.Spider):
    name = 'speakers'
    start_urls = ['http://www.ase2014.org/',
                  'https://en.wikipedia.org/wiki/Federated_Computing_Research_Conference']
    nlp = spacy.load('xx')
    def start_requests(self):
        with open('conference-links-1.json') as f:
        with open('conference-links-2.json') as f:
            data = json.load(f)
            for index, conference in enumerate(data):
                yield Request(conference['website'], self.parse, meta=dict(conference=conference))
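        # Each entry in the conference-links JSON is expected to provide at
        # least a 'website' URL plus 'speaker_urls', 'speaker_pages' and
        # 'speakers' lists, which the callbacks below append to.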
    def parse(self, response):
        # print(f"{response.meta['fieldIndex']} - {response.meta['conferenceIndex']}")
        # if response.meta['fieldIndex'] == 0 and response.meta['conferenceIndex'] == 0:
        #     response.meta['data'][0]['conferences'][0]['speakers'] = ["ben x"]
        #     # print(f"{response.meta['data'][0]['conferences'][0]['speakers']}")
        #     yield response.meta['data'][0]
        # keywords = ['keynotes', 'speaker', 'invited', 'program']
        # via keywords
        links = response.xpath('//a[descendant::*[re:match(text(), "keynotes|speaker|invited|program", "i")]]/@href').extract()
        links = response.xpath('//*[descendant::*[re:match(text(), "keynotes|speaker|invited|program", "i")]]/@href').extract()
        for link in links:
            nextPath = link if link.startswith('http') else response.urljoin(link)
            response.meta['conference']['speaker_urls'].append(nextPath)
            yield Request(nextPath, meta=dict(conference=response.meta['conference']), callback=self.parse_names)
        # otherwise look for all links
    def parse_names(self, response):
        speakers = response.xpath('//*[re:match(text(), "^prof\.|^dr\.", "i")]/text()').extract()
        text = self.__remove_whitespace__(remove_tags(
            replace_escape_chars(strip_html5_whitespace(response.text))))
        # speakers = response.xpath('//*[re:match(text(), "^prof\.|^dr\.", "i")]/text()').extract()
        speakers = self.__find_names__(text)
        conference = response.meta['conference']
        conference['speakers'] = speakers
        conference['speakers'].append(speakers)
        # conference['speaker_urls'].append(response.url)
        conference['speaker_pages'].append(text)
        yield conference
    def __remove_whitespace__(self, text):
        return re.sub(' +|&nbsp| ', ' ', text)
    def __find_names__(self, text):
        doc = self.nlp(text)
        entries_person = filter(lambda x: x.label_ == 'PER', doc.ents)
        return list(map(lambda x: x.text, entries_person))
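    # Note: spacy.load('xx') assumes the multilingual NER model (xx_ent_wiki_sm)
    # is installed, e.g. via `python -m spacy download xx`; its entities carry
    # PER/LOC/ORG/MISC labels, and only PER entities are kept here.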
# text = remove_tags(replace_escape_chars(strip_html5_whitespace(response.text)))
# nlp = spacy.load('xx')
# doc = nlp(text)