from bs4 import BeautifulSoup
import hashlib
import asyncio
import dateparser
from urllib.parse import urlparse

import soup                # local helper module that performs the HTTP requests
from filter import filter  # local module; filter(article, filterFile) returns a category string


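# Fetch one article's detail page asynchronously and hand the HTML to the
# parser, which stores it under the article's key.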
async def runAddArticleContent(url: str, key: str, p):
    s = soup.serve(url)
    content = await s.htmlAsync()
    await p.addArticleContent(content, key)


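# Scrape the Stadt Zürich "Soziale Dienste" job listing, classify every article
# via filterFile and, in 'full' mode, fetch the remaining detail pages
# concurrently. Returns the parser's articleObj dict.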
async def init(filterFile, mode):
    url = "https://www.stadt-zuerich.ch/content/sd/de/index/ueber_das_departement/arbeiten_sd/offene_stellen_sd.html?taetigkeitsbereich=&q=&departement=Sozialdepartement&beschaeftigungsgrad=0&publikationsdatum=0&dienstabteilung=Soziale+Dienste&jobPage=1&stellentyp=&kaderstellen=false&q_type=jobs#moreJobs"
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])

    # Initialize the parser with the base URL, e.g. https://www.stadt-zuerich.ch
    p = parse(baseUrl)

    # Load the listing page and collect all articles into the parser's articleObj
    s = soup.serve(url)
    await p.addArticles(s.html())

    # Walk the articles and queue detail-page requests depending on the filter verdict
    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]

        match filter(article, filterFile):
            case 'all':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                p.articleObj[key]['prio'] = 'important'
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'pinned':
                if mode == 'full':
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'

    # Pop filtered-out articles only after the loop; mutating while iterating would fail
    for item in popFilterArray:
        p.articleObj.pop(item)

    await asyncio.gather(*runLst)

    return p.articleObj


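# Parser for the job listing. articleObj maps an md5-based article id to a dict
# with the fields Anstellungsart, Alter, Titel, Firma, id, siId, href, lnk,
# prio and, once addArticleContent has run, content.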
class parse:

    def __init__(self, baseUrl):
        self.baseUrl = baseUrl
        self.articleObj = {}

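    # Parse the listing HTML and merge every job row (a <td> inside .mod_table)
    # into articleObj. If the table is missing because of a connection error,
    # a pinned INFO-OBJECT pseudo-article is added so the error stays visible.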
    async def addArticles(self, content):
        Data = {}

        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.find(class_='mod_table')

        if results is None:
            connectionCheck = content.find('Error: Connection issue')
            if connectionCheck != -1:
                type = 'ERROR'
                msg = 'Connection issue'
                Data['INFO-OBJECT'] = {
                    'Titel': type,
                    'msg': msg,
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, **Data}

            return False

        if len(results.find_all("td")) <= 0:
            return False

        for article in results.find_all("td"):
            # The date line reads "<label>|<date>"; strip whitespace and keep the date part
            age = article.find_all('p', class_='date noborder')[0].contents[0].text.replace(u'\xa0', u'').replace('\n', '').replace(' ', '').split('|')[1]
            age = dateparser.parse(age).strftime('%d.%m.%Y')
            title = article.find('strong').string

            # Derive the employment type from keywords in the title
            type = 'Festanstellung'
            if 'befristet' in title:
                type = 'Befristete Stelle'
            if 'Stundenlohn' in title:
                type = 'Stundenlohn'
            if 'Leiter' in title:
                type = 'Kader'

            company = 'Stadt Zürich Soziale Dienste'
            id = hashlib.md5("-".join([type, age, title, company]).encode('utf-8')).hexdigest()
            href = article.find('a').get('href')
            siId = href.split('.')[-2]
            lnk = "".join([self.baseUrl, href])

            Data[id] = {
                'Anstellungsart': type,
                'Alter': age,
                'Titel': title,
                'Firma': company,
                'id': id,
                'siId': siId,
                'href': href,
                'lnk': lnk,
                'prio': 'normal',
                'content': 'none'
            }

        self.articleObj = {**self.articleObj, **Data}
        return True

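    # Parse a job detail page (.mod_jobdetails) and attach location, start date,
    # accordion description and application link to the article given by id.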
    async def addArticleContent(self, content, id):
        DOMdocument = BeautifulSoup(content, 'html.parser')

        results = DOMdocument.find(class_='mod_jobdetails')

        if results is None:
            return False

        listItems = results.find_all(class_="mod_accordion")
        if len(listItems) <= 0:
            return False

        # Collect "title\ntext" blocks from every accordion panel
        descArray = []
        for item in listItems:
            title = item.find_all('span', class_="trigger_title")[0].text
            descString = item.find_all(class_="mod_jobdescription mod_styledtext")
            jobdesc = ''
            for j in descString:
                jobdesc = jobdesc + j.get_text()
            descArray.append(title + '\n' + jobdesc + '\n')

        desc = '\n'.join(descArray)
        apply = results.find(id='bewerben').get('href')

        self.articleObj[id]['content'] = {
            'Ort': locale(results),
            'Antritt': beginn(results),
            'Beschreibung': desc,
            'Anmelden': apply,
            'Kontaktperson': 'none'
        }


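# Heuristic: the start date follows the phrase " suchen wir " in one of the
# intro paragraphs, e.g. "... suchen wir per <Datum> ...".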
def beginn(results):
    for item in results.find_all('p'):
        txt = item.text
        index = txt.find(' suchen wir ')
        if index != -1:
            # len(' suchen wir ') is 12; skip 2 more chars so the date sits in tokens 1-3
            index = index + 14
            dateString = txt[index:]
            dataArray = dateString.split(' ')
            date = ' '.join([dataArray[1], dataArray[2], dataArray[3]])
            return date
    return 'none'


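# Heuristic: the location is everything in the intro paragraph before the word
# "suchen" (the text up to " suchen wir ...").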
def locale(results):
    for item in results.find_all('p'):
        txt = item.text
        pos = txt.find(' suchen wir ')
        if pos != -1:
            location = ''
            itemArray = txt.split(' ')
            for i in range(len(itemArray)):
                if itemArray[i] == 'suchen':
                    break
                location = location + ' ' + itemArray[i]
            return location
    return 'none'
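

# Minimal usage sketch: run the scraper in 'full' mode so detail pages are
# fetched too. The filter file name 'jobFilter.json' is an assumption for
# illustration; pass whatever file your local filter module expects.
if __name__ == '__main__':
    articles = asyncio.run(init('jobFilter.json', 'full'))
    for key, article in articles.items():
        print(article['prio'], article['Titel'], article.get('lnk', article['href']))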