Commit 9e255674 authored by Orkut Karacalik
Browse files

set todos

parent a0f4ab88
......@@ -18,12 +18,10 @@ class CsConferencesSpider(scrapy.Spider):
for (index, item) in enumerate(items):
htmlResponse = self.__convert_html__(item)
if htmlResponse.xpath("//h2/span[contains(@class, 'mw-headline')]/text()").extract_first() is not None:
currentField = htmlResponse.xpath(
"//h2/span[contains(@class, 'mw-headline')]/text()").extract_first()
currentField = htmlResponse.xpath("//h2/span[contains(@class, 'mw-headline')]/text()").extract_first()
currentSubfield = "-"
elif htmlResponse.xpath("//h3/span[contains(@class, 'mw-headline')]/text()").extract_first() is not None:
currentSubfield = htmlResponse.xpath(
"//h3/span[contains(@class, 'mw-headline')]/text()").extract_first()
currentSubfield = htmlResponse.xpath("//h3/span[contains(@class, 'mw-headline')]/text()").extract_first()
if currentField == 'See also':
......@@ -37,10 +35,15 @@ class CsConferencesSpider(scrapy.Spider):
def parse_conferences(self, response):
confItem = response.meta['confItem']
links = response.xpath(
"//div[@id='bodyContent']//a[starts-with(@href, 'http') and not(contains(@href, 'wikipedia')) ]/@href").extract()
links = response.xpath("//div[@id='bodyContent']//a[starts-with(@href, 'http') and not(contains(@href, 'wikipedia')) ]/@href").extract()
confItem['conferences'] = [ ConferenceWebsite(link=l, speakers=[]) for l in links]
yield confItem
def find_pages(self, response):
def parse_speakers(self, response):
# Wraps the raw markup passed as `item` in an HtmlResponse (empty URL, UTF-8 encoding); the parse loop in this spider then runs XPath queries on the returned object.
def __convert_html__(self, item):
return HtmlResponse(url="", body=item, encoding="utf-8")
......@@ -3,6 +3,7 @@ import scrapy
import datetime
from css.items import AuthorProfileItem
# TODO: get puppeteer
class HIndexSpider(scrapy.Spider):
name = 'h-index'
allowed_domains = ['']
......@@ -3,6 +3,10 @@ import scrapy
from scrapy.http import HtmlResponse
from scrapy.http.request import Request
# TODO: load json file
# TODO: clean html response
# TODO: if no link is found, search through whole pages (links should be cleaned)
# TODO: activate ner
class SpeakersSpider(scrapy.Spider):
name = 'speakers'
start_urls = ['']
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment