batchGame/bin/parseStzh.py
2023-06-05 13:11:47 +02:00

197 lines
6.2 KiB
Python

from bs4 import BeautifulSoup
import hashlib
import soup
import asyncio
import datetime
import dateparser
from urllib.parse import urlparse
from filter import filter
async def runAddArticleContent(url: str, key: str, p):
    """Fetch one job detail page and pass its HTML to the parser.

    Args:
        url: Address of the detail page to download.
        key: Key of the matching entry in ``p.articleObj``.
        p: Parser object exposing an awaitable ``addArticleContent``.
    """
    detailHtml = await soup.serve(url).htmlAsync()
    await p.addArticleContent(detailHtml, key)
async def init(filterFile, mode):
    """Collect the Stadt Zürich job listing and return the filtered articles.

    Args:
        filterFile: Filter definition forwarded unchanged to ``filter``.
        mode: When equal to ``'full'`` the detail page of every article
            rated ``'all'``, ``'important'`` or ``'pinned'`` is fetched too.

    Returns:
        The parser's ``articleObj`` dict after filtering.
    """
    url = "https://www.stadt-zuerich.ch/content/sd/de/index/ueber_das_departement/arbeiten_sd/offene_stellen_sd.html?taetigkeitsbereich=&q=&departement=Sozialdepartement&beschaeftigungsgrad=0&publikationsdatum=0&dienstabteilung=Soziale+Dienste&jobPage=1&stellentyp=&kaderstellen=false&q_type=jobs#moreJobs"
    # Scheme + host of the listing URL; relative job links are joined onto it.
    urlParts = urlparse(url)
    baseUrl = "://".join([urlParts.scheme, urlParts.netloc])
    p = parse(baseUrl)

    # Load the overview page and let the parser fill p.articleObj.
    # NOTE(review): html() is not awaited, so it is presumably a blocking
    # call - confirm against the soup module.
    await p.addArticles(soup.serve(url).html())

    fetchDetails = mode == 'full'
    rejectedKeys = []
    detailJobs = []
    for key, article in p.articleObj.items():
        verdict = filter(article, filterFile)
        if verdict == 'none':
            # Removal is deferred: the dict must not shrink while iterating.
            rejectedKeys.append(key)
            continue
        if verdict in ('important', 'pinned'):
            article['prio'] = verdict
        if fetchDetails and verdict in ('all', 'important', 'pinned'):
            detailJobs.append(runAddArticleContent(article['lnk'], key, p))
        # 'some' (and any other verdict) keeps the article without details.

    for key in rejectedKeys:
        p.articleObj.pop(key)

    # Run all detail-page requests concurrently.
    await asyncio.gather(*detailJobs)
    return p.articleObj
class parse:
    """Parser for the Stadt Zürich (Soziale Dienste) job listing.

    Parsed jobs are collected in ``self.articleObj``, a dict that maps an
    MD5 based id to one dict per job posting.
    """

    def __init__(self, baseUrl):
        """Store the base URL used to turn relative job links into absolute ones."""
        self.baseUrl = baseUrl
        self.articleObj = {}

    @staticmethod
    def _employmentType(title):
        """Derive the employment type from keywords found in the job title."""
        kind = 'Festanstellung'
        # Later checks intentionally override earlier ones.
        if 'befristet' in title:
            kind = 'Befristete Stelle'
        if 'Stundenlohn' in title:
            kind = 'Stundenlohn'
        if 'Leiter' in title:
            kind = 'Kader'
        return kind

    @staticmethod
    def _siId(href):
        """Return the second-to-last dot separated part of ``href``."""
        parts = href.split('.')
        # With no dot at all the index becomes -1, i.e. the whole href.
        return parts[len(parts) - 2]

    def _parseCell(self, cell):
        """Build the article dict for one table cell, or None if it is malformed."""
        dateTags = cell.find_all('p', class_='date noborder')
        titleTag = cell.find('strong')
        linkTag = cell.find('a')
        if not dateTags or not dateTags[0].contents:
            return None
        if titleTag is None or linkTag is None:
            return None

        href = linkTag.get('href')
        if href is None:
            return None

        title = titleTag.string
        if title is None:
            # .string is None when <strong> has nested markup.
            title = titleTag.get_text()

        # The date paragraph is read as "<something>|<date>" once all
        # whitespace (including non-breaking spaces) has been removed.
        rawDate = dateTags[0].contents[0].text
        rawDate = rawDate.replace(u'\xa0', u'').replace('\n', '').replace(' ', '')
        dateFields = rawDate.split('|')
        if len(dateFields) < 2:
            return None
        parsedDate = dateparser.parse(dateFields[1])
        if parsedDate is None:
            return None
        age = parsedDate.strftime('%d.%m.%Y')

        kind = self._employmentType(title)
        company = 'Stadt Zürich Soziale Dienste'
        articleId = hashlib.md5(
            "-".join([kind, age, title, company]).encode('utf-8')
        ).hexdigest()
        return {
            'Anstellungsart': kind,
            'Alter': age,
            'Titel': title,
            'Firma': company,
            'id': articleId,
            'siId': self._siId(href),
            'href': href,
            'lnk': "".join([self.baseUrl, href]),
            'prio': 'normal',
            'content': 'none',
        }

    async def addArticles(self, content):
        """Parse the listing page and merge the found jobs into ``articleObj``.

        Cells that lack a date, title or link are skipped instead of
        aborting the whole parse.

        Args:
            content: HTML of the listing page as a string.

        Returns:
            True when the job table contained at least one cell, otherwise
            False. If the table is missing and ``content`` contains
            'Error: Connection issue', a pinned 'INFO-OBJECT' entry is added.
        """
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(class_='mod_table')
        if results is None:
            if 'Error: Connection issue' in content:
                infoEntry = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none',
                }
                self.articleObj = {**self.articleObj, 'INFO-OBJECT': infoEntry}
            # Without a table there is nothing to parse, whatever the reason.
            return False

        cells = results.find_all("td")
        if not cells:
            return False

        found = {}
        for cell in cells:
            entry = self._parseCell(cell)
            if entry is not None:
                found[entry['id']] = entry
        self.articleObj = {**self.articleObj, **found}
        return True

    async def addArticleContent(self, content, id):
        """Parse a job detail page and store it under ``articleObj[id]['content']``.

        Args:
            content: HTML of the detail page as a string.
            id: Key of the article in ``articleObj`` (the name shadows the
                builtin but is kept for callers passing it by keyword).

        Returns:
            True when the content was stored, False when the id is unknown
            or the page lacks the expected job detail markup.
        """
        if id not in self.articleObj:
            return False
        DOMdocument = BeautifulSoup(content, 'html.parser')
        results = DOMdocument.find(class_='mod_jobdetails')
        if results is None:
            return False
        sections = results.find_all(class_="mod_accordion")
        if not sections:
            return False

        descArray = []
        for section in sections:
            titleTag = section.find('span', class_="trigger_title")
            heading = titleTag.text if titleTag is not None else ''
            jobdesc = ''.join(
                part.get_text()
                for part in section.find_all(class_="mod_jobdescription mod_styledtext")
            )
            descArray.append(heading + '\n' + jobdesc + '\n')
        desc = '\n'.join(descArray)

        # A missing apply link is stored as 'none', the placeholder this
        # file already uses for absent values.
        applyTag = results.find(id='bewerben')
        applyLnk = applyTag.get('href') if applyTag is not None else 'none'

        self.articleObj[id]['content'] = {
            'Ort': locale(results),
            'Antritt': beginn(results),
            'Beschreibung': desc,
            'Anmelden': applyLnk,
            'Kontaktperson': 'none'
        }
        return True
def beginn(results):
    """Extract the start date that follows ' suchen wir ' in the job text.

    The first ``<p>`` containing the marker is used. Up to three
    space-separated tokens after the word that follows the marker are
    returned (e.g. '1. Mai 2024', or just 'sofort' when the text ends early).

    Args:
        results: Object offering ``find_all('p')`` whose items have ``.text``.

    Returns:
        The extracted text, or 'none' when no paragraph yields any token.
    """
    marker = ' suchen wir '
    for paragraph in results.find_all('p'):
        text = paragraph.text
        pos = text.find(marker)
        if pos == -1:
            continue
        # The offset of 14 is two characters more than the 12-character
        # marker, so the cut lands inside the next word (presumably
        # 'per' / 'ab'); its remainder becomes tokens[0] and is discarded.
        tokens = text[pos + 14:].split(' ')
        # Slicing instead of fixed indexes avoids an IndexError when fewer
        # than three tokens follow.
        date = ' '.join(tokens[1:4])
        if date:
            return date
    return 'none'
def locale(results):
    """Return the words preceding 'suchen' in the first matching paragraph.

    The first ``<p>`` whose text contains ' suchen wir ' is split on single
    spaces; every token before the first 'suchen' token is kept, each one
    prefixed with a space (so the result starts with a space).

    Args:
        results: Object offering ``find_all('p')`` whose items have ``.text``.

    Returns:
        The collected text, or 'none' when no paragraph contains the marker.
    """
    for paragraph in results.find_all('p'):
        text = paragraph.text
        if ' suchen wir ' not in text:
            continue
        tokens = text.split(' ')
        # The marker is space-delimited, so 'suchen' is always a whole token.
        leading = tokens[:tokens.index('suchen')]
        return ''.join(' ' + token for token in leading)
    return 'none'