batchGame/bin/parseHomegate.py

import asyncio
from pprint import pprint
import re
from bs4 import BeautifulSoup
import hashlib
import soup
from urllib.parse import urlparse
from filter import filter
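# Fetch one result-list page (url with the page number appended) and feed its HTML
# to parse.addArticles(); returns that call's success flag.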
async def runAddArticles(pageCrawl: str, p, url):
pageUrl = url + str(pageCrawl)
s = soup.serve(pageUrl)
content = await s.htmlAsync()
responseFlag = await p.addArticles(content, pageCrawl)
return responseFlag
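# Fetch a single listing's detail page and store its parsed content under the given article key.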
async def runAddArticleContent(url: str, key: str, p):
s = soup.serve(url)
content = await s.htmlAsync()
await p.addArticleContent(content, key)
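# Crawl the homegate.ch result pages in batches of crawlStep, run every collected listing
# through the filter and, in 'full' mode, fetch the detail pages; returns the article dictionary.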
async def init(filterFile, mode):
url = "https://www.homegate.ch/rent/apartment/city-zurich/matching-list?o=dateCreated-desc&ep="
baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])
pageCrawl = 1
crawlStep = 10
p = parse(baseUrl)
    # crawl result pages in batches of crawlStep and collect their listings into p.articleObj
while True:
runLst = [runAddArticles(i, p, url) for i in range(pageCrawl,pageCrawl+crawlStep)]
result = await asyncio.gather(*runLst)
pageCrawl += crawlStep
        if False in result:  # a page returned no usable listings (or an error); stop crawling
break
    # run each article through the filter; queue detail-page requests in 'full' mode and drop filtered-out entries
popFilterArray = []
runLst = []
for key in p.articleObj:
article = p.articleObj[key]
        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                p.articleObj[key]['prio'] = 'important'
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'
for item in popFilterArray:
p.articleObj.pop(item)
await asyncio.gather(*runLst)
return p.articleObj
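# Collects listings from homegate.ch result pages into articleObj, keyed by an MD5 hash
# of each listing's main fields.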
class parse:
def __init__(self, baseUrl):
self.baseUrl = baseUrl
self.articleObj = {}
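    # Parse one result-list page and merge its listings into articleObj.
    # Returns False when the page yields no usable listings (end of results or a connection error).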
async def addArticles(self, content, pageCrawl):
        # listings are numbered continuously across pages (20 results per page); page 1 starts at 0
        articleCount = (pageCrawl - 1) * 20
Data = {}
DOMdocument = BeautifulSoup(content, 'html.parser')
results = DOMdocument.select('div[data-test="result-list"]')
        if not results:  # select() returns a list; an empty list means the result container was not found
connectionCheck = content.find('Error: Connection issue')
if connectionCheck != -1:
type = 'ERROR'
msg = 'Connection issue'
Data['INFO-OBJECT'] = {
'Titel': type,
'msg': msg,
'id': 'INFO-OBJECT',
'href': self.baseUrl,
'prio': 'pinned',
'content': 'none'
}
newArticleObj = {**self.articleObj, **Data}
self.articleObj = newArticleObj
return False
        try:
            resultList = results[0].select('[data-test="result-list-item"]')
        except Exception:
            print('result list could not be parsed')
            return False
        if not resultList:
#print(f'try list is 0 {results[0]}')
return False
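        # extract the fields of every listing card; most fields fall back through several
        # CSS class-name variants (old and new homegate layouts)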
for article in resultList:
articleCount += 1
#room value
            try:
                room = article.findAll("span", class_=re.compile('ListItemRoomNumber_value_*'))[0].getText()
            except Exception:
                room = 'none'
            if room == 'none':
                # newer layout: rooms and living space are two spans inside one wrapper element
                try:
                    wrapper = article.findAll(class_=re.compile('HgListingRoomsLivingSpace_roomsLivingSpace_*'))[0]
                    room = wrapper.findAll('span')[0].getText() + wrapper.findAll('span')[1].getText()
                except Exception:
                    room = 'none'
            # description: the class name changed between homegate layouts, so try the selectors in order
            desc = 'none'
            for descClass in ('ListItemDescription_', 'ListItemDescription_description_', 'HgListingDescription_small_*'):
                try:
                    desc = article.findAll(class_=re.compile(descClass))[0].getText()
                except Exception:
                    desc = 'none'
                if desc != 'none':
                    break
            # title: prefer the dedicated title element; otherwise a title is derived from the description below
            try:
                title = article.findAll('p', class_=re.compile('HgListingDescription_title_*'))[0].getText()
            except Exception:
                title = 'none'
if title == 'none':
                res = re.search(r'\D\.', desc)  # first sentence boundary: a period not preceded by a digit
                if res is None:
title = desc
else:
pos = res.span()[1]
title = desc[:pos]
length = 100
if len(title) > length:
if (title.find(' ', length) != -1):
length = title.find(' ', length)
title = title[:length] + ' ...'
title = title[:-5]+' ...'
                if title is None:
                    title = 'none'
            # price: try the old and the new price element; the new one appends the billing period after a '/'
            try:
                price = article.findAll(class_=re.compile('ListItemPrice_price_'))[0].getText()
            except Exception:
                price = 'none'
            if price == 'none':
                try:
                    price = article.findAll(class_=re.compile('HgListingCard_price_*'))[0].getText().split('/')[0]
                except Exception:
                    price = 'none'
            # address: prefer a paragraph containing a Zurich postal code (80xx Z...), otherwise the address element
            address = 'none'
            for line in article.findAll('p'):
                string = line.getText()
                if re.search(r'80\d\d Z', string) is not None:
                    address = string
            if address == 'none':
                try:
                    address = article.findAll(class_=re.compile('HgListingCard_address_*'))[0].getText()
                except Exception:
                    address = 'none'
            # image link: first <img> in the card, if any
            try:
                image = article.findAll('img')[0].get('src') or 'none'
            except Exception:
                image = 'none'
            # link: the card itself may be an <a>; otherwise use its first nested <a>
            href = article.get('href')
            if href is None:
                anchor = article.find('a')
                href = anchor.get('href') if anchor is not None else None
            if href is None:
                href = "/no/none"
            # site reference: last path segment of the listing URL
            siId = href.split('/')[-1]
#create full link
lnk = "".join([self.baseUrl, href])
            # create unique id: hash of the identifying fields, so the same listing maps to the same key across crawls
            id = hashlib.md5("-".join([room, price, title, address, siId]).encode('utf-8')).hexdigest()
Data[id] = {
'Zimmer': room,
'Preis': price,
'Titel': title,
'Adresse': address,
'Beschreibung': desc,
'id': id,
'siId': siId,
'href': href,
'lnk': lnk,
'prio': 'normal',
'content': 'none',
'number': articleCount,
'imgUrl': image
}
            if id in self.articleObj:
                print(f'found duplicate: {siId}')
newArticleObj = {**self.articleObj, **Data}
self.articleObj = newArticleObj
return True
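    # Parse a listing's detail page into articleObj[id]['content'] (or return it when mode='return').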
async def addArticleContent(self, content, id, mode='add'):
DOMdocument = BeautifulSoup(content, 'html.parser')
results = DOMdocument.find("div", class_=re.compile('DetailPage_detailPageContent_*'))
if results is None:
return False
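        # only the lead image is extracted below; the remaining detail fields are kept as empty placeholders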
location = ''
beginn = ''
apply = ''
contact = ''
image = ''
        try:
            image = results.select('li[data-glide-index="0"]')[0].find('img').get('src')
        except Exception:
            image = None
        if image is None:
            image = 'none'
Data = {
'Ort': location,
'Antritt': beginn,
'Anmelden': apply,
'Kontakt': contact,
'imgUrl': image
}
if mode == 'add':
self.articleObj[id]['content'] = Data
if mode == 'return':
return Data