import asyncio
import hashlib
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from pprint import pprint

import soup
from filter import filter


async def runAddArticles(pageCrawl: int, p, url):
    # fetch one result-list page and feed it to the parser
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    responseFlag = await p.addArticles(content, pageCrawl)
    return responseFlag


async def runAddArticleContent(url: str, key: str, p):
    # fetch a single listing's detail page and attach it to the article
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


async def init(filterFile, mode):
    url = "https://www.homegate.ch/rent/apartment/city-zurich/matching-list?o=dateCreated-desc&ep="
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])
    pageCrawl = 1
    crawlStep = 10
    p = parse(baseUrl)

    # crawl the result pages in batches of crawlStep until one of them fails
    while True:
        runLst = [runAddArticles(i, p, url) for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break

    # loop through the article object and request detail content depending on the filter verdict
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]
        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                p.articleObj[key]['prio'] = 'important'
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'

    # drop everything the filter rejected
    for item in popFilterArray:
        p.articleObj.pop(item)

    await asyncio.gather(*runLst)
    return p.articleObj


class parse:
    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content, pageCrawl):
        # continue numbering across pages (20 listings per result page)
        if pageCrawl > 1:
            articleCount = pageCrawl * 20
        else:
            articleCount = 0
        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.select('div[data-test="result-list"]')
        if not results:
            # no result list: report connection issues as a pinned info entry
            connectionCheck = content.find('Error: Connection issue')
            if connectionCheck != -1:
                type = 'ERROR'
                msg = 'Connection issue'
                Data['INFO-OBJECT'] = {
                    'Titel': type,
                    'msg': msg,
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                newArticleObj = {**self.articleObj, **Data}
                self.articleObj = newArticleObj
            return False

        try:
            resultList = results[0].select('[data-test="result-list-item"]')
        except Exception:
            print('try list is none')
            return False
        if len(resultList) <= 0:
            # print(f'try list is 0 {results[0]}')
            return False

        for article in resultList:
            articleCount += 1

            # room value
            try:
                room = article.findAll("span", class_=re.compile('ListItemRoomNumber_value_*'))[0].getText()
            except Exception:
                room = 'none'
            if room == 'none':
                try:
                    room = article.findAll(class_=re.compile('HgListingRoomsLivingSpace_roomsLivingSpace_*'))[0].findAll('span')[0].getText()
                    size = article.findAll(class_=re.compile('HgListingRoomsLivingSpace_roomsLivingSpace_*'))[0].findAll('span')[1].getText()
                    room = room + size
                except Exception:
                    room = 'none'

            # get description string (several markup variants are tried in turn)
            try:
                desc = article.findAll(class_=re.compile('ListItemDescription_'))[0].getText()
                if desc is None:
                    desc = 'none'
            except Exception:
                desc = 'none'
            if desc == 'none':
                try:
                    desc = article.findAll(class_=re.compile('ListItemDescription_description_'))[0].getText()
                    if desc is None:
                        desc = 'none'
                except Exception:
                    desc = 'none'
            if desc == 'none':
                try:
                    desc = article.findAll(class_=re.compile('HgListingDescription_small_*'))[0].getText()
                    if desc is None:
                        desc = 'none'
                except Exception:
                    desc = 'none'
            # get title string
            try:
                title = article.findAll('p', class_=re.compile('HgListingDescription_title_*'))[0].getText()
                if title is None:
                    title = 'none'
            except Exception:
                title = 'none'
            if title == 'none':
                # fall back to the first sentence of the description
                res = re.search(r'\D\.', desc)
                if res is None:
                    title = desc
                else:
                    pos = res.span()[1]
                    title = desc[:pos]
                # truncate long titles, preferring the next word boundary after 100 characters
                length = 100
                if len(title) > length:
                    if title.find(' ', length) != -1:
                        length = title.find(' ', length)
                    title = title[:length] + ' ...'
            if title is None:
                title = 'none'

            # get price value
            try:
                price = article.findAll(class_=re.compile('ListItemPrice_price_'))[0].getText()
                if price is None:
                    price = 'none'
            except Exception:
                price = 'none'
            if price == 'none':
                try:
                    price = article.findAll(class_=re.compile('HgListingCard_price_*'))[0].getText()
                    price = price.split('/')[0]
                    if price is None:
                        price = 'none'
                except Exception:
                    price = 'none'

            # get address (look for a Zurich postal code 80xx first)
            address = 'none'
            for line in article.findAll('p'):
                string = line.getText()
                result = re.search(r'80\d\d Z', string)
                if result is not None:
                    address = string
            if address == 'none':
                try:
                    address = article.findAll(class_=re.compile('HgListingCard_address_*'))[0].getText()
                    if address is None:
                        address = 'none'
                except Exception:
                    address = 'none'

            # get image link
            try:
                image = article.findAll('img')[0].get('src')
            except Exception:
                image = 'none'
            if image is None:
                image = 'none'

            # get link
            href = article.get('href')
            if href is None:
                href = article.find('a').get('href')
            if href is None:
                href = "/no/none"

            # get site reference
            siId = href.split('/')[-1]
            # create full link
            lnk = "".join([self.baseUrl, href])
            # create unique id
            id = hashlib.md5("-".join([room, price, title, address, siId]).encode('utf-8')).hexdigest()

            Data[id] = {
                'Zimmer': room,
                'Preis': price,
                'Titel': title,
                'Adresse': address,
                'Beschreibung': desc,
                'id': id,
                'siId': siId,
                'href': href,
                'lnk': lnk,
                'prio': 'normal',
                'content': 'none',
                'number': articleCount,
                'imgUrl': image
            }
            if id in self.articleObj.keys():
                print(f'found duplicate: {siId}')

        newArticleObj = {**self.articleObj, **Data}
        self.articleObj = newArticleObj
        return True

    async def addArticleContent(self, content, id, mode='add'):
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find("div", class_=re.compile('DetailPage_detailPageContent_*'))
        if results is None:
            return False

        # detail fields are placeholders for now; only the lead image is extracted
        location = ''
        beginn = ''
        apply = ''
        contact = ''
        image = ''
        try:
            image = results.select('li[data-glide-index="0"]')[0].find('img').get('src')
        except Exception:
            image = None
        if image is None:
            image = 'none'

        Data = {
            'Ort': location,
            'Antritt': beginn,
            'Anmelden': apply,
            'Kontakt': contact,
            'imgUrl': image
        }
        if mode == 'add':
            self.articleObj[id]['content'] = Data
        if mode == 'return':
            return Data
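
# Usage sketch (illustrative, not part of the original module): how the crawl could be
# driven end to end. It assumes the local `soup` and `filter` modules behave as used
# above, and 'filter.json' is a placeholder path for whatever format
# `filter(article, filterFile)` actually expects.
if __name__ == '__main__':
    # mode='full' also fetches every detail page; any other value only collects the result lists
    articleObj = asyncio.run(init('filter.json', 'full'))
    for key, article in articleObj.items():
        print(article.get('Titel'), article.get('Preis'), article.get('lnk'))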