# 276 lines, 7.6 KiB, Python
import asyncio
|
|
from pprint import pprint
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
import hashlib
|
|
import soup
|
|
from urllib.parse import urlparse
|
|
from filter import filter
|
|
|
|
async def runAddArticles(pageCrawl: int, p, url):
    """Fetch one result-list page and hand its HTML to the parser.

    pageCrawl -- 1-based page number appended to *url*. Fix: the original
                 annotated this as ``str``, but ``init`` passes ints from
                 ``range()``; the annotation now matches actual usage.
    p         -- parse instance collecting the articles
    url       -- listing URL ending in the page-number query parameter

    Returns the flag from ``p.addArticles``: False once a page yields no
    articles (end of pagination or connection problem), True otherwise.
    """
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    return await p.addArticles(content, pageCrawl)
|
|
|
|
async def runAddArticleContent(url: str, key: str, p):
    """Download an article detail page and store it under *key*.

    url -- absolute link to the detail page
    key -- id of the article record in ``p.articleObj``
    p   -- parse instance that receives the detail content
    """
    server = soup.serve(url)
    html = await server.htmlAsync()
    await p.addArticleContent(html, key)
|
|
|
|
|
|
async def init(filterFile, mode):
    """Crawl the full result list, filter the articles, and return them.

    filterFile -- filter definition handed to ``filter()`` per article
    mode       -- 'full' additionally downloads each kept article's
                  detail page; any other value skips detail fetching

    Returns ``p.articleObj``: the dict of surviving article records.
    """
    url = "https://www.homegate.ch/rent/apartment/city-zurich/matching-list?o=dateCreated-desc&ep="
    parts = urlparse(url)
    baseUrl = "://".join([parts.scheme, parts.netloc])

    page = 1
    step = 10
    p = parse(baseUrl)

    # Fetch result pages in batches of `step` concurrent requests until
    # one page in a batch reports no more articles.
    while True:
        batch = [runAddArticles(n, p, url) for n in range(page, page + step)]
        flags = await asyncio.gather(*batch)
        page += step
        if False in flags:
            break

    # Decide per article whether to drop it, tag its priority, and/or
    # schedule a detail-page download.
    dropKeys = []
    tasks = []
    for key in p.articleObj:
        article = p.articleObj[key]
        verdict = filter(article, filterFile)
        if verdict == 'none':
            dropKeys.append(key)
            continue
        if verdict == 'important':
            p.articleObj[key]['prio'] = 'important'
        elif verdict == 'pinned':
            p.articleObj[key]['prio'] = 'pinned'
        # 'some' keeps the article but never fetches details.
        if verdict in ('all', 'important', 'pinned') and mode == 'full':
            tasks.append(runAddArticleContent(article['lnk'], key, p))

    for key in dropKeys:
        p.articleObj.pop(key)

    await asyncio.gather(*tasks)

    return p.articleObj
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class parse:
    """Collects homegate.ch listings into a dict of article records.

    ``articleObj`` maps an md5 key (or the literal 'INFO-OBJECT') to a
    dict describing one listing; ``addArticleContent`` later attaches
    detail-page data under that record's 'content' key.
    """

    def __init__(self, baseUrl):
        # Scheme+host prefix used to turn relative hrefs into full links.
        self.baseUrl = baseUrl
        # key -> article record; merged into by addArticles().
        self.articleObj = {}

    def _firstText(self, article, classPattern):
        """Text of the first descendant whose class matches
        *classPattern*, or 'none' when no such element exists."""
        try:
            text = article.findAll(class_=re.compile(classPattern))[0].getText()
        except IndexError:
            return 'none'
        return 'none' if text is None else text

    def _makeTitle(self, desc):
        """Derive a short title from the description *desc*.

        The title runs up to the first '.' preceded by a non-digit (so a
        decimal such as '3.5' does not end it) and is truncated to about
        100 characters at a word boundary with a trailing ' ...'.
        """
        # Fix: raw string — '\D' was an invalid escape sequence.
        res = re.search(r'\D\.', desc)
        title = desc if res is None else desc[:res.span()[1]]
        limit = 100
        if len(title) > limit:
            # Prefer cutting at the first space after the limit.
            if title.find(' ', limit) != -1:
                limit = title.find(' ', limit)
            title = title[:limit] + ' ...'
            # NOTE(review): the original then stripped five characters and
            # re-added the ellipsis, dropping one extra character; the
            # stripped source's indentation was ambiguous here — confirm.
            title = title[:-5] + ' ...'
        return title

    async def addArticles(self, content, pageCrawl):
        """Extract every result-list item from one page of list HTML.

        content   -- the page HTML as text
        pageCrawl -- 1-based page number; continues the running article
                     numbering across pages (20 items per page)

        Returns False when the page has no result list (end of pagination
        or a connection problem, for which a pinned INFO-OBJECT entry is
        stored), True otherwise.
        """
        # Continue numbering where the previous page ended. Fix: the
        # original used pageCrawl * 20, so page 2 started numbering at 41
        # instead of 21 (an off-by-one-page error).
        articleCount = (pageCrawl - 1) * 20 if pageCrawl > 1 else 0

        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.select('div[data-test="result-list"]')

        # Fix: select() returns a list and never None, so the original
        # `results is None` check left this error branch unreachable.
        if not results:
            if content.find('Error: Connection issue') != -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, **Data}
            return False

        resultList = results[0].select('[data-test="result-list-item"]')
        if not resultList:
            return False

        for article in resultList:
            articleCount += 1

            # Room count, e.g. '3.5'; 'none' when the node is missing.
            try:
                room = article.findAll("span", class_=re.compile('ListItemRoomNumber_value_*'))[0].getText()
            except IndexError:
                room = 'none'

            # Description: broad class pattern first, narrower fallback.
            desc = self._firstText(article, 'ListItemDescription_')
            if desc == 'none':
                desc = self._firstText(article, 'ListItemDescription_description_')

            title = self._makeTitle(desc)

            price = self._firstText(article, 'ListItemPrice_price_')

            # Address: the <p> containing a Zurich zip code ("80xx Z...").
            address = 'tbd'
            for line in article.findAll('p'):
                string = line.getText()
                if re.search(r'80\d\d Z', string) is not None:
                    address = string

            # First list image, if any.
            try:
                image = article.findAll('img')[0].get('src')
            except IndexError:
                image = None
            if image is None:
                image = 'none'

            # Detail-page link: on the item itself or on a nested <a>.
            href = article.get('href')
            if href is None:
                href = article.find('a').get('href')
            if href is None:
                href = "/no/none"

            # The last path segment is the site's own listing id.
            siId = href.split('/')[-1]

            lnk = "".join([self.baseUrl, href])

            # Stable key from the list-view fields (renamed from `id`,
            # which shadowed the builtin; the 'id' dict key is unchanged).
            key = hashlib.md5("-".join([room, price, title, address, siId]).encode('utf-8')).hexdigest()

            Data[key] = {
                'Zimmer': room,
                'Preis': price,
                'Titel': title,
                'Adresse': address,
                'Beschreibung': desc,
                'id': key,
                'siId': siId,
                'href': href,
                'lnk': lnk,
                'prio': 'normal',
                'content': 'none',
                'number': articleCount,
                'imgUrl': image
            }

            if key in self.articleObj:
                print(f'found dublicate: {siId}')

        self.articleObj = {**self.articleObj, **Data}
        return True

    async def addArticleContent(self, content, id, mode='add'):
        """Parse a detail page and attach its data to article *id*.

        mode='add' stores the data under ``articleObj[id]['content']``;
        mode='return' returns the data dict instead. Returns False when
        the detail container is missing from the page.
        """
        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.find("div", class_=re.compile('DetailPage_detailPageContent_*'))
        if results is None:
            return False

        # Placeholders — only the image is actually extracted so far.
        location = ''
        beginn = ''
        apply = ''
        contact = ''

        # First gallery image, when present.
        try:
            image = results.select('li[data-glide-index="0"]')[0].find('img').get('src')
        except (IndexError, AttributeError):
            image = None
        if image is None:
            image = 'none'

        Data = {
            'Ort': location,
            'Antritt': beginn,
            'Anmelden': apply,
            'Kontakt': contact,
            'imgUrl': image
        }

        if mode == 'add':
            self.articleObj[id]['content'] = Data
        if mode == 'return':
            return Data