"""Scraper for the Stadt Zürich "Soziale Dienste" job listing.

``init`` downloads the listing page, turns every table cell into an article
record, applies the user filter and (in ``'full'`` mode) fetches the detail
page of every article that passed the filter.
"""

import asyncio
import datetime
import hashlib
from urllib.parse import urlparse

from bs4 import BeautifulSoup
import dateparser

from filter import filter
import soup

# Phrase that introduces both the workplace and the start date in a job text.
_SEARCH_MARKER = ' suchen wir '


async def runAddArticleContent(url: str, key: str, p):
    """Download the detail page at *url* and store it on article *key* of *p*.

    Args:
        url: Absolute link to the job detail page.
        key: Key of the article inside ``p.articleObj``.
        p: The ``parse`` instance that owns the article.
    """
    page = soup.serve(url)
    html = await page.htmlAsync()
    await p.addArticleContent(html, key)


async def init(filterFile, mode):
    """Scrape the job listing and return the filtered article dictionary.

    Args:
        filterFile: Passed through unchanged to ``filter(article, filterFile)``.
        mode: When equal to ``'full'`` the detail page of every article rated
            ``'all'``, ``'important'`` or ``'pinned'`` is fetched as well.

    Returns:
        ``dict`` mapping article id to article record (see
        ``parse.addArticles`` for the record layout).
    """
    url = "https://www.stadt-zuerich.ch/content/sd/de/index/ueber_das_departement/arbeiten_sd/offene_stellen_sd.html?taetigkeitsbereich=&q=&departement=Sozialdepartement&beschaeftigungsgrad=0&publikationsdatum=0&dienstabteilung=Soziale+Dienste&jobPage=1&stellentyp=&kaderstellen=false&q_type=jobs#moreJobs"
    location = urlparse(url)
    baseUrl = "://".join([location.scheme, location.netloc])

    # The parser needs the site root (scheme + host) to build absolute links.
    p = parse(baseUrl)

    # Load the listing page and let the parser fill p.articleObj.
    # NOTE(review): s.html() is called without await inside a coroutine while
    # the detail pages use htmlAsync(); confirm whether this call blocks.
    s = soup.serve(url)
    await p.addArticles(s.html())

    # Rate every article; collect rejected keys and pending detail downloads.
    fetch_details = mode == 'full'
    rejected = []
    pending = []
    for key, article in p.articleObj.items():
        verdict = filter(article, filterFile)
        if verdict == 'none':
            rejected.append(key)
            continue
        if verdict not in ('all', 'important', 'pinned'):
            # 'some' (and any unknown verdict): keep the article untouched.
            continue
        if verdict != 'all':
            article['prio'] = verdict
        # The INFO-OBJECT placeholder carries no 'lnk'; indexing it directly
        # would raise KeyError, so only real articles are scheduled.
        if fetch_details and 'lnk' in article:
            pending.append(runAddArticleContent(article['lnk'], key, p))

    # Removal happens after the loop: the dict must not change size while
    # it is being iterated.
    for key in rejected:
        p.articleObj.pop(key)

    await asyncio.gather(*pending)
    return p.articleObj


class parse:
    """Collects article records scraped from listing and detail pages.

    Attributes are plain data:
      baseUrl    -- scheme and host prepended to relative article links.
      articleObj -- dict mapping article id to the article record.
    """

    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        """Parse the listing page *content* and merge its jobs into articleObj.

        Args:
            content: HTML text of the listing page.

        Returns:
            ``True`` when at least one table cell was processed, ``False``
            when the job table (class ``mod_table``) is missing or empty.
            If the table is missing and *content* contains the text
            ``'Error: Connection issue'``, a pinned ``'INFO-OBJECT'`` entry
            describing the error is added before returning ``False``.
        """
        document = BeautifulSoup(content, 'html.parser')
        table = document.find(class_='mod_table')

        if table is None:
            # Only a recognised connection error produces an info entry.
            # Building the entry unconditionally would read names that are
            # bound only in this branch.
            if 'Error: Connection issue' in content:
                info = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, 'INFO-OBJECT': info}
            return False

        cells = table.find_all("td")
        if not cells:
            return False

        found = {}
        company = 'Stadt Zürich Soziale Dienste'
        for cell in cells:
            # The date is the part after the '|' in the first node of the
            # 'date noborder' paragraph, with whitespace removed.
            date_node = cell.find_all('p', class_='date noborder')[0].contents[0]
            raw_date = (
                date_node.text
                .replace(u'\xa0', u'')
                .replace('\n', '')
                .replace(' ', '')
                .split('|')[1]
            )
            # dateparser.parse returns None for text it cannot interpret;
            # keep the raw text in that case instead of crashing.
            parsed_date = dateparser.parse(raw_date)
            age = parsed_date.strftime('%d.%m.%Y') if parsed_date is not None else raw_date

            title = cell.find('strong').string

            # Later keywords win over earlier ones, so the order matters.
            employment = 'Festanstellung'
            if 'befristet' in title:
                employment = 'Befristete Stelle'
            if 'Stundenlohn' in title:
                employment = 'Stundenlohn'
            if 'Leiter' in title:
                employment = 'Kader'

            article_id = hashlib.md5(
                "-".join([employment, age, title, company]).encode('utf-8')
            ).hexdigest()

            href = cell.find('a').get('href')
            # Second-to-last dot-separated piece of the link. The explicit
            # len() - 2 index is kept on purpose: for a link without any dot
            # it evaluates to -1 and yields the whole link.
            href_parts = href.split('.')
            siId = href_parts[len(href_parts) - 2]
            lnk = "".join([self.baseUrl, href])

            found[article_id] = {
                'Anstellungsart': employment,
                'Alter': age,
                'Titel': title,
                'Firma': company,
                'id': article_id,
                'siId': siId,
                'href': href,
                'lnk': lnk,
                'prio': 'normal',
                'content': 'none'
            }

        # Rebind to a fresh dict so earlier references are left untouched.
        self.articleObj = {**self.articleObj, **found}
        return True

    async def addArticleContent(self, content, id):
        """Parse a job detail page and store it as ``articleObj[id]['content']``.

        Args:
            content: HTML text of the detail page.
            id: Key of the article in ``articleObj`` that receives the content.

        Returns:
            ``False`` when the page has no ``mod_jobdetails`` element or no
            ``mod_accordion`` sections (nothing is stored), ``True`` once the
            content has been stored.
        """
        document = BeautifulSoup(content, 'html.parser')
        details = document.find(class_='mod_jobdetails')
        if details is None:
            return False

        sections = details.find_all(class_="mod_accordion")
        if not sections:
            return False

        chunks = []
        for section in sections:
            # A section without a heading contributes an empty title instead
            # of aborting the whole page.
            heading = section.find('span', class_="trigger_title")
            title = heading.text if heading is not None else ''
            body = ''.join(
                part.get_text()
                for part in section.find_all(class_="mod_jobdescription mod_styledtext")
            )
            chunks.append(title + '\n' + body + '\n')
        desc = '\n'.join(chunks)

        # Pages without an apply link get the file's usual 'none' marker.
        apply_link = details.find(id='bewerben')
        apply = apply_link.get('href') if apply_link is not None else 'none'

        self.articleObj[id]['content'] = {
            'Ort': locale(details),
            'Antritt': beginn(details),
            'Beschreibung': desc,
            'Anmelden': apply,
            'Kontaktperson': 'none'
        }
        return True


def beginn(results):
    """Extract the start date that follows ' suchen wir ' in a job text.

    Scans the ``<p>`` elements of *results* for the marker and returns three
    space-separated tokens found after it, joined by single spaces.

    Returns:
        The extracted text, or ``'none'`` when no paragraph contains the
        marker followed by enough tokens.
    """
    for paragraph in results.find_all('p'):
        text = paragraph.text
        start = text.find(_SEARCH_MARKER)
        if start == -1:
            continue
        # The offset 14 skips the 12-character marker plus two more
        # characters; token 0 is then the rest of the word right after the
        # marker and tokens 1-3 are returned.
        tokens = text[start + 14:].split(' ')
        if len(tokens) < 4:
            # Too short to hold a date; try the next paragraph rather than
            # raising IndexError.
            continue
        return ' '.join(tokens[1:4])
    return 'none'


def locale(results):
    """Return the words that precede 'suchen' in the first matching paragraph.

    The first ``<p>`` of *results* containing ' suchen wir ' is split on
    single spaces; every word before the first word equal to ``'suchen'`` is
    kept, each prefixed with one space (so the result starts with a space).

    Returns:
        The collected words, or ``'none'`` when no paragraph contains the
        marker.
    """
    for paragraph in results.find_all('p'):
        text = paragraph.text
        if text.find(_SEARCH_MARKER) == -1:
            continue
        collected = []
        for word in text.split(' '):
            if word == 'suchen':
                break
            collected.append(' ' + word)
        return ''.join(collected)
    return 'none'