Commit 29b5e7bf authored by Orkut Karaçalık

a few additions

parent 71c847c0
@@ -3,15 +3,41 @@
# EXAM Q: shortest path, how to compute closeness centrality
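# (Side note on the question above: closeness centrality of a node v is
#  (n - 1) divided by the sum of shortest-path distances from v to every other node.
#  A minimal sketch with networkx, which is not used elsewhere in this script and
#  is only assumed here for illustration:
#      import networkx as nx
#      G = nx.path_graph(4)              # path graph 0-1-2-3
#      nx.closeness_centrality(G)        # {0: 0.5, 1: 0.75, 2: 0.75, 3: 0.5}
#      # e.g. node 1: distances 1 + 1 + 2 = 4, so (4 - 1) / 4 = 0.75)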
import pandas as pd
import re
import w3lib.html
from scrapy.http import HtmlResponse
from difflib import SequenceMatcher, Differ
def convert_html(text):
    return HtmlResponse(url="a", body=text, encoding="utf-8")
df_raw = pd.read_json('pages-1.json')
def pre_process(text):
    # convert to html and keep the longest <body> extraction
    text_1 = max(convert_html(text).xpath('//body').extract(), key=len)
    # remove comments
    text_2 = w3lib.html.remove_comments(text_1, encoding="utf-8")
    # remove script, style and img tags together with their content
    text_3 = w3lib.html.remove_tags_with_content(text_2, ('script', 'style', 'img'), encoding="utf-8")
    # replace tabs, newlines
    # text_4 = w3lib.html.replace_escape_chars(text_3, replace_by=' ', encoding="utf-8")
    text_4 = text_3
    # collapse runs of spaces, &nbsp; entities and non-breaking spaces into a single space
    text_5 = re.sub(' +|&nbsp;|\xa0', ' ', text_4)
    return text_5
df_raw = pd.read_json('pages-2.json')
df = df_raw.drop(['hasKeywords', 'website'], axis=1)
df = df.rename(index=str, columns={"speaker_urls": "url", "speaker_pages": "page"})
#df['lenPre'] = df.page.apply(lambda x: len(x))
df.page = df.page.apply(pre_process)
df.home = df.home.apply(pre_process)
#df['lenPost'] = df.page.apply(lambda x: len(x))
#df['reduce'] = (df.lenPre - df.lenPost) / df.lenPre * 100
#df['sim'] =
#df.to_json('pages-cleaned-1.json', orient='records')
index = 1
index = 137
home = df.home[index]
page = df.page[index]
url = df.url[index]
df.to_json('pages-cleaned-1.json', orient='records')
page_old = df_raw.speaker_pages[index]
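The SequenceMatcher/Differ imports and the commented-out df['sim'] line above suggest a similarity column was planned; a minimal sketch of how it could be filled, assuming the cleaned home and page texts are compared directly (this is not code from the commit):

from difflib import SequenceMatcher

def text_similarity(a, b):
    # ratio() gives a 0..1 similarity score between the two cleaned texts
    return SequenceMatcher(None, a, b).ratio()

df['sim'] = df.apply(lambda row: text_similarity(row.home, row.page), axis=1)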
@@ -31,4 +31,4 @@ class ComputerScienceConferencesSpider(scrapy.Spider):
    def check_keywords(self, response):
        response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]').extract()
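A small standalone check of the EXSLT re: predicate used in check_keywords above, runnable outside the spider; the sample HTML is made up, and re:test is shown as the boolean form (re:match as used in the spider should also work as a predicate, since a non-empty match set is truthy):

from scrapy.selector import Selector

html = '<div><a href="/speakers">Invited Speakers</a><a href="/venue">Venue</a></div>'
sel = Selector(text=html)
# Scrapy selectors register the EXSLT regular-expressions namespace as "re:" by default
sel.xpath('//a[re:test(text(), "speaker|invited|keynote", "i")]/@href').extract()
# -> ['/speakers']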
@@ -11,16 +11,16 @@ class ConferenceSpeakersSpider(scrapy.Spider):
    name = 'conference-speakers'
    nlp = spacy.load('xx')
    #+ 1. process html items: parse, urljoin, duplicates
    # 1. process html items: parse, urljoin, duplicates
    # 2. make request
    # 3. ner
    def start_requests(self):
        with open('top-computer-science-conferences-keywords-6.json') as f:
        with open('top-computer-science-conferences-keywords-7.json') as f:
            data = json.load(f)
        for conference in data:
            for url in conference['speaker_urls']:
                yield Request(url, self.parse, meta=dict(conference=conference))
    def parse(self, response):
        page = response.text
        # text = self.__remove_whitespace__(remove_tags(replace_escape_chars(strip_html5_whitespace(response.text))))
......
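For the "ner" step noted in the comments above, a rough sketch of how speaker names might be pulled from the cleaned page text with the multilingual model loaded as spacy.load('xx'), assuming it resolves to xx_ent_wiki_sm, whose person label is PER (this is not code from the commit):

import spacy
from w3lib.html import remove_tags

nlp = spacy.load('xx')  # multilingual NER model

def extract_person_names(html_text):
    # strip markup first, then keep entities tagged as persons
    doc = nlp(remove_tags(html_text))
    return [ent.text for ent in doc.ents if ent.label_ == 'PER']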
@@ -12,7 +12,7 @@ class ConferenceWebsiteSpider(scrapy.Spider):
            data = json.load(f)
        for conference in data:
            yield Request(conference['website'], self.parse, meta=dict(conference=conference))
    # 1. it could be in the current page
    # 1.1. it could be within picked elements
    # 2. it could be in a linked page/fragment
@@ -24,17 +24,18 @@
        if len(texts) == 0:
            texts = response.xpath('//*[re:match(text(), "speaker|invited|keynote", "i")]').extract()
        result.extend(texts)
        if len(result) > 0:
            conference = response.meta['conference']
            conference['hasKeywords'] = result
            conference['speaker_urls'] = list(set(self.__join_urls__(self.__extract_urls__(result), response)))
            conference['home'] = response.text
            yield conference
    def __extract_urls__(self, items):
        get_href = lambda item: HtmlResponse(url="a", body=item, encoding="utf-8").xpath('//*/@href').extract_first()
        return list(map(get_href, items))
    def __join_urls__(self, links, response):
        filter_none = lambda links: list(filter(None.__ne__, links))
        next_url = lambda link: link if link.startswith('http') else response.urljoin(link)
......
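For reference, the URL joining that __join_urls__ performs can be checked in isolation like this; the URLs are made up and the list comprehensions mirror the filter_none and next_url lambdas above (a sketch, not code from the commit):

from urllib.parse import urljoin

base = 'http://conf.example.org/2018/index.html'   # hypothetical conference home page
links = ['speakers.html', 'http://other.example.org/keynote', None]
cleaned = [l for l in links if l is not None]       # same effect as filter_none
absolute = [l if l.startswith('http') else urljoin(base, l) for l in cleaned]
# -> ['http://conf.example.org/2018/speakers.html', 'http://other.example.org/keynote']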