import asyncio
import hashlib
import json
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import soup
from filter import filter


async def runAddArticles(pageCrawl: int, p, url):
    # Fetch one paginated result page and feed its HTML into the parser.
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    responseFlag = await p.addArticles(content)
    return responseFlag


async def runAddArticleContent(url: str, key: str, p):
    # Fetch a single article page and attach its details to the parser entry.
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


async def init(filterFile, mode):
    url = "https://www.sozialinfo.ch/jobs?qualifications%5B%5D=Hochschulbildung&employments%5B%5D=Festanstellung&locations%5B%5D=region.58412&page="
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])
    pageCrawl = 1
    crawlStep = 10
    p = parse(baseUrl)

    # Crawl result pages in batches of crawlStep until a batch contains a
    # page with no articles, collecting everything into the parser's articleObj.
    while True:
        runLst = [runAddArticles(i, p, url)
                  for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break

    # Run each collected article through the filter: drop 'none' matches,
    # tag 'important'/'pinned' priorities, and (in 'full' mode) schedule a
    # detail-page request for every article that is kept.
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]
        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'important'
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'

    for item in popFilterArray:
        p.articleObj.pop(item)

    await asyncio.gather(*runLst)
    return p.articleObj


class parse:
    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        # Parse one result page and merge its job listings into articleObj.
        # Returns True if at least one article was found; False otherwise,
        # with errors recorded as a pinned INFO-OBJECT entry so they surface
        # in the final result.
        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='app')
        try:
            # The listings are embedded as JSON in the app container's
            # data-initial-state attribute.
            results = json.loads(results['data-initial-state'])
        except Exception:
            Data['INFO-OBJECT'] = {
                'Titel': 'ERROR',
                'msg': "Couldn't parse app JSON data",
                'id': 'INFO-OBJECT',
                'href': self.baseUrl,
                'prio': 'pinned',
                'content': 'none',
            }
            self.articleObj = {**self.articleObj, **Data}
            return False

        if results is None:
            # Empty initial state: inspect the raw HTML to report why.
            if content.find('Stellenangeboten') == -1:
                # The listings marker is missing entirely, which usually
                # means a captcha page was served instead of the job list.
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Captcha solving needed',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                    'data': content,
                }
                self.articleObj = {**self.articleObj, **Data}
            if content.find('Error: Connection issue') != -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                }
                self.articleObj = {**self.articleObj, **Data}
            return False

        if len(results) <= 0:
            return False

        resFlag = False
        for i in results:
            # The initial state maps section names to article lists; skip
            # any section that doesn't have the expected shape.
            try:
                resultsList = results[i]
                for article in resultsList:
                    type = article['position']
                    try:
                        # Reformat the ISO date 'YYYY-MM-DD' as 'DD.MM.YYYY'.
                        age = article['sortingTime'][0:10]
                        age = '.'.join(age.split('-')[::-1])
                    except Exception:
                        age = article['sortingTime'][0:10]
                    title = article['title']
                    company = article['organisation']
                    # Stable key derived from the fields identifying a posting.
                    id = hashlib.md5('-'.join([type, age, title, company]).encode('utf-8')).hexdigest()
                    href = article['url']
                    siId = article['uid']
                    lnk = ''.join([self.baseUrl, href])
                    Data[id] = {
                        'Anstellungsart': type,
                        'Alter': age,
                        'Titel': title,
                        'Firma': company,
                        'id': id,
                        'siId': siId,
                        'href': href,
                        'lnk': lnk,
                        'prio': 'normal',
                        'content': 'none',
                    }
                    resFlag = True
            except Exception:
                pass

        self.articleObj = {**self.articleObj, **Data}
        return resFlag

    async def addArticleContent(self, content, id):
        # Parse an article detail page and store the extracted fields
        # under articleObj[id]['content'].
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='jobsummary')
        location = ''
        beginn = ''  # starting date; stays empty if the page doesn't list one
        if results is None:
            return False
        listItems = results.find_all('li')
        if len(listItems) <= 0:
            return False
        # The summary list holds labelled spans; pick out the starting date
        # ('Stellenantritt') and the workplace ('Arbeitsort').
        for item in listItems:
            spans = item.find_all('span')
            for j in range(len(spans)):
                if spans[j].text == 'Stellenantritt':
                    beginn = ' '.join(item.text.replace('Stellenantritt', '').split())
                if spans[j].text == 'Arbeitsort':
                    try:
                        location = ' '.join(spans[j + 2].text.split())
                    except Exception:
                        location = 'none'

        results = DOMdocument.find_all(class_='detailDescription')
        if len(results) <= 0 or len(results[0].find_all('div')) <= 0:
            return False
        div = results[0].find_all('div')[0]
        desc = div.text.replace('\n', '').replace('Stellenbeschrieb', '')

        result = DOMdocument.find(id='apply-now')
        # The apply link can be missing; fall back to 'none' instead of crashing.
        apply = result.get('href') if result is not None else 'none'

        results = DOMdocument.find_all(class_='workingAreaLink')
        field = [item.get_text() for item in results]

        self.articleObj[id]['content'] = {
            'Ort': location,
            'Antritt': beginn,
            'Beschreibung': desc,
            'Anmelden': apply,
            'Arbeitsbereich': field,
            'Kontaktperson': 'none',
        }