"""Async crawler for sozialinfo.ch job listings (canton Zürich).

Result pages are fetched concurrently, parsed with BeautifulSoup, classified
via the project-local filter module, and collected into parse.articleObj.
"""

import asyncio
import hashlib
from urllib.parse import urlparse

from bs4 import BeautifulSoup

import soup  # project-local fetch helper wrapping the HTTP requests
from filter import filter as articleFilter  # project-local; aliased to avoid shadowing the builtin


async def runAddArticles(pageCrawl: int, p, url):
    """Fetch one result page and feed it to the parser; return its flag."""
    pageUrl = url + str(pageCrawl)
    s = soup.serve(pageUrl)
    content = await s.htmlAsync()
    return await p.addArticles(content)


async def runAddArticleContent(url: str, key: str, p):
    """Fetch one article detail page and store its content under key."""
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


async def init(filterFile, mode):
    url = ("https://www.sozialinfo.ch/jobs/erweiterte-suche"
           "?extsearch[filter][]=canton:Z%C3%BCrich"
           "&extsearch[filter][]=percentOf:0-100"
           "&extsearch[page]=")
    parsed = urlparse(url)
    baseUrl = "://".join([parsed.scheme, parsed.netloc])
    pageCrawl = 1
    crawlStep = 10
    p = parse(baseUrl)

    # Crawl result pages in batches of crawlStep until one page reports no
    # (more) articles, collecting everything into p.articleObj.
    while True:
        runLst = [runAddArticles(i, p, url)
                  for i in range(pageCrawl, pageCrawl + crawlStep)]
        result = await asyncio.gather(*runLst)
        pageCrawl += crawlStep
        if False in result:
            break

    # Classify each article via the filter file and, in 'full' mode, queue a
    # detail-page request for everything worth keeping.
    popFilterArray = []
    runLst = []
    for key, article in p.articleObj.items():
        match articleFilter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                article['prio'] = 'important'
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                article['prio'] = 'pinned'

    # Drop filtered-out articles, then fetch all queued detail pages at once.
    for item in popFilterArray:
        p.articleObj.pop(item)
    await asyncio.gather(*runLst)
    return p.articleObj


class parse:
    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        """Extract article teasers from one result page.

        Returns True if articles were found, False on an empty page, captcha
        wall, or connection error (the latter two are recorded as a pinned
        INFO-OBJECT entry).
        """
        Data = {}
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(id='results-list')
        if results is None:
            # No result list: distinguish a captcha wall from a connection error.
            if content.find('Stellenangebote') == -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Captcha solving needed',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                    'data': content
                }
            if content.find('Error: Connection issue') != -1:
                Data['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
            self.articleObj.update(Data)
            return False

        articles = results.find_all("article")
        if not articles:
            return False
        for article in articles:
            infoBar = article.find_all('div', class_='info-bar')[0]
            jobType = infoBar.contents[1].text
            age = infoBar.contents[3].text.split('\n')[1].split()[0]
            title = ' '.join(article.find_all('div', class_='content-body')[0]
                             .find('h3').text.split())
            company = (article.find_all('div', class_='footer')[0]
                       .find_all('span', class_='text-bold')[0].text)
            # Stable key derived from the teaser fields.
            articleId = hashlib.md5(
                "-".join([jobType, age, title, company]).encode('utf-8')
            ).hexdigest()
            href = article.find('a').get('href')
            siId = href.split('-')[-1]  # sozialinfo's own id, last URL segment
            Data[articleId] = {
                'Anstellungsart': jobType,
                'Alter': age,
                'Titel': title,
                'Firma': company,
                'id': articleId,
                'siId': siId,
                'href': href,
                'lnk': "".join([self.baseUrl, href]),
                'prio': 'normal',
                'content': 'none'
            }
        self.articleObj.update(Data)
        return True

    async def addArticleContent(self, content, key):
        """Parse one detail page and attach it as content to articleObj[key]."""
        DOMdocument = BeautifulSoup(content, 'html.parser')
        summary = DOMdocument.find(id='jobsummary')
        # Defaults; 'Stellenantritt'/'Arbeitsort' may be missing from the summary.
        location = 'none'
        beginn = 'none'
        if summary is None:
            return False
        listItems = summary.find_all("li")
        if not listItems:
            return False
        for item in listItems:
            spans = item.find_all('span')
            for j, span in enumerate(spans):
                if span.text == 'Stellenantritt':
                    beginn = ' '.join(item.text.replace('Stellenantritt', '').split())
                if span.text == 'Arbeitsort':
                    try:
                        location = ' '.join(spans[j + 2].text.split())
                    except IndexError:
                        location = 'none'

        descriptions = DOMdocument.find_all(class_='detailDescription')
        if not descriptions or not descriptions[0].find_all("div"):
            return False
        div = descriptions[0].find_all("div")[0]
        desc = div.text.replace('\n', '').replace('Stellenbeschrieb', '')

        # The apply link may be absent (e.g. on expired postings).
        applyNode = DOMdocument.find(id='apply-now')
        applyLink = applyNode.get('href') if applyNode is not None else 'none'

        field = [node.get_text()
                 for node in DOMdocument.find_all(class_='workingAreaLink')]

        self.articleObj[key]['content'] = {
            'Ort': location,
            'Antritt': beginn,
            'Beschreibung': desc,
            'Anmelden': applyLink,
            'Arbeitsbereich': field,
            'Kontaktperson': 'none'
        }
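

# A minimal usage sketch, assuming the module is run directly. The filter
# file name 'filter.json' is an illustrative assumption; its real format is
# defined by the project-local filter module. 'full' is the mode that also
# fetches detail pages, as handled in init() above.
if __name__ == '__main__':
    async def main():
        articles = await init('filter.json', 'full')
        # One summary line per collected article: priority and title.
        for article in articles.values():
            print(f"{article.get('prio', 'normal'):>9}  {article.get('Titel', '?')}")

    asyncio.run(main())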