# batchGame/bin/parseStzh.py

from bs4 import BeautifulSoup
import hashlib
import soup
import asyncio
import datetime
import dateparser
from urllib.parse import urlparse
from filter import filter

async def runAddArticleContent(url: str, key: str, p):
    """Download one article's detail page and attach it to the parser.

    Args:
        url: Address of the detail page, passed to ``soup.serve``.
        key: Id of the article inside ``p.articleObj`` that receives the content.
        p: Parser object exposing an awaitable ``addArticleContent(content, key)``.
    """
    # The awaited result of htmlAsync() is forwarded unchanged to the parser.
    html = await soup.serve(url).htmlAsync()
    await p.addArticleContent(html, key)

async def init(filterFile, mode):
    url = "https://www.stadt-zuerich.ch/content/sd/de/index/ueber_das_departement/arbeiten_sd/offene_stellen_sd.html?taetigkeitsbereich=&q=&departement=Sozialdepartement&beschaeftigungsgrad=0&publikationsdatum=0&dienstabteilung=Soziale+Dienste&jobPage=1&stellentyp=&kaderstellen=false&q_type=jobs#moreJobs"
    baseUrl = "://".join([urlparse(url).scheme, urlparse(url).netloc])

    #init parse sozialinfo and provide base url = https://sozialinfo.net
    p = parse(baseUrl)

    #get all different pages into parser class articleObj
    s = soup.serve(url)
    await p.addArticles(s.html())


    #loop through article object make content requests in specific kinds

    popFilterArray = []
    runLst = []
    for key in p.articleObj:
        article = p.articleObj[key]

        match(filter(article, filterFile)):
            case 'all':
                if mode == 'full' :
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'some':
                pass
            case 'none':
                popFilterArray.append(key)
            case 'important':
                p.articleObj[key]['prio'] = 'important'
                if mode == 'full' :
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
            case 'pinned':
                if mode == 'full' :
                    runLst.append(runAddArticleContent(article['lnk'], key, p))
                p.articleObj[key]['prio'] = 'pinned'

    for item in popFilterArray:
        p.articleObj.pop(item)

    await asyncio.gather(*runLst)

    return p.articleObj


class parse:
    """Parser for the 'Stadt Zürich Soziale Dienste' job listing pages.

    Parsed postings are stored in ``articleObj``, a dict mapping an article
    id to its record. ``addArticles`` fills it from the overview page and
    ``addArticleContent`` attaches detail-page data to one existing entry.
    """

    def __init__(self, baseUrl):
        """Store *baseUrl* (prefixed to every article href) and start empty."""
        self.baseUrl = baseUrl
        self.articleObj = {}

    async def addArticles(self, content):
        """Parse the overview page HTML and merge its postings into articleObj.

        Args:
            content: HTML text of the overview page.

        Returns:
            ``False`` when no element with class ``mod_table`` exists or the
            table has no ``<td>`` cells, ``True`` otherwise. If the table is
            missing and *content* contains ``'Error: Connection issue'``, a
            pinned ``INFO-OBJECT`` entry describing the error is stored.
        """
        parsed = {}

        document = BeautifulSoup(content, 'html.parser')
        table = document.find(class_='mod_table')

        if table is None:
            if 'Error: Connection issue' in content:
                parsed['INFO-OBJECT'] = {
                    'Titel': 'ERROR',
                    'msg': 'Connection issue',
                    'id': 'INFO-OBJECT',
                    'href': self.baseUrl,
                    'prio': 'pinned',
                    'content': 'none'
                }
                self.articleObj = {**self.articleObj, **parsed}
            return False

        cells = table.find_all("td")
        if not cells:
            return False

        for cell in cells:
            record = self._parseCell(cell)
            # Cells without the expected date/title/link structure are skipped
            # so one odd cell does not abort the whole listing.
            if record is not None:
                parsed[record['id']] = record

        self.articleObj = {**self.articleObj, **parsed}
        return True

    def _parseCell(self, cell):
        """Build the article record for one table cell, or None if it is malformed."""
        dateTags = cell.find_all('p', class_='date noborder')
        strong = cell.find('strong')
        anchor = cell.find('a')
        if not dateTags or strong is None or anchor is None:
            return None

        href = anchor.get('href')
        if href is None or not dateTags[0].contents:
            return None

        # The second '|'-separated field of the cleaned date line is the date.
        dateLine = (
            dateTags[0].contents[0].text
            .replace('\xa0', '')
            .replace('\n', '')
            .replace('  ', '')
        )
        dateFields = dateLine.split('|')
        if len(dateFields) < 2:
            return None

        # dateparser.parse returns None for text it cannot interpret.
        published = dateparser.parse(dateFields[1])
        if published is None:
            return None
        age = published.strftime('%d.%m.%Y')

        # .string is None when <strong> has several children; fall back to the
        # concatenated text. str() detaches the title from the parse tree.
        rawTitle = strong.string
        title = str(rawTitle if rawTitle is not None else strong.get_text())

        # Later keywords take precedence over earlier ones:
        # 'Leiter' > 'Stundenlohn' > 'befristet' > default.
        if 'Leiter' in title:
            employment = 'Kader'
        elif 'Stundenlohn' in title:
            employment = 'Stundenlohn'
        elif 'befristet' in title:
            employment = 'Befristete Stelle'
        else:
            employment = 'Festanstellung'

        company = 'Stadt Zürich Soziale Dienste'
        articleId = hashlib.md5(
            "-".join([employment, age, title, company]).encode('utf-8')
        ).hexdigest()

        # Second-to-last dot-separated piece of the href; for an href without
        # any dot the index is -1 and the whole href is used.
        hrefParts = href.split('.')
        siId = hrefParts[len(hrefParts) - 2]

        return {
            'Anstellungsart': employment,
            'Alter': age,
            'Titel': title,
            'Firma': company,
            'id': articleId,
            'siId': siId,
            'href': href,
            'lnk': "".join([self.baseUrl, href]),
            'prio': 'normal',
            'content': 'none'
        }

    async def addArticleContent(self, content, id):
        """Parse a detail page and store it as the ``content`` of article *id*.

        Args:
            content: HTML text of the job detail page.
            id: Key of the article in ``articleObj`` to update.

        Returns:
            ``False`` when the page has no ``mod_jobdetails`` element or no
            ``mod_accordion`` sections (articleObj is left untouched),
            ``True`` once the content has been stored.

        Raises:
            KeyError: If *id* is not present in ``articleObj``.
        """
        document = BeautifulSoup(content, 'html.parser')
        details = document.find(class_='mod_jobdetails')
        if details is None:
            return False

        sections = details.find_all(class_="mod_accordion")
        if not sections:
            return False

        descParts = []
        for section in sections:
            # A section without a title span contributes an empty heading.
            titleTag = section.find('span', class_="trigger_title")
            heading = titleTag.text if titleTag is not None else ''
            body = ''.join(
                part.get_text()
                for part in section.find_all(class_="mod_jobdescription mod_styledtext")
            )
            descParts.append(heading + '\n' + body + '\n')

        # Pages without an element with id 'bewerben' get the 'none' placeholder.
        applyTag = details.find(id='bewerben')
        applyLink = applyTag.get('href') if applyTag is not None else 'none'

        self.articleObj[id]['content'] = {
            'Ort': locale(details),
            'Antritt': beginn(details),
            'Beschreibung': '\n'.join(descParts),
            'Anmelden': applyLink,
            'Kontaktperson': 'none'
        }
        return True

def beginn(results):
    """Extract the three-word phrase following " suchen wir " in a paragraph.

    Each ``<p>`` of *results* is scanned for the marker ``' suchen wir '``.
    The text starting 14 characters after the marker position is split on
    single spaces and tokens 1 to 3 are joined with spaces. The marker is
    12 characters long, so the slice begins 2 characters into the following
    text and token 0 is discarded.

    Args:
        results: Object whose ``find_all('p')`` yields items with a ``text``
            attribute.

    Returns:
        The extracted phrase, or ``'none'`` when no paragraph yields one.
    """
    marker = ' suchen wir '
    for paragraph in results.find_all('p'):
        text = paragraph.text
        pos = text.find(marker)
        if pos == -1:
            continue

        tokens = text[pos + 14:].split(' ')
        # Too little text after the marker: try the next paragraph instead of
        # failing with an IndexError.
        if len(tokens) < 4:
            continue
        return ' '.join(tokens[1:4])
    return 'none'

def locale(results):
    """Return the words preceding "suchen" in the first matching paragraph.

    The first ``<p>`` of *results* whose text contains ``' suchen wir '`` is
    split on single spaces; every word before the first ``'suchen'`` token
    is emitted with a leading space, so a non-empty result starts with a
    space.

    Args:
        results: Object whose ``find_all('p')`` yields items with a ``text``
            attribute.

    Returns:
        The collected words, or ``'none'`` when no paragraph contains the marker.
    """
    for paragraph in results.find_all('p'):
        text = paragraph.text
        if text.find(' suchen wir ') == -1:
            continue

        leading = []
        for word in text.split(' '):
            if word == 'suchen':
                break
            leading.append(' ' + word)
        return ''.join(leading)

    return 'none'