Commit 182c9eb4, authored Aug 09, 2024 by Karsa Zoltán István
MoodleScraping utils
Showing 1 changed file with 123 additions and 0 deletions

moodle_scraping.py (new file, mode 100644, +123 / -0)
import requests
import ssl
import json
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# summary link pointing to the results of the given quiz (ZH)
ILIAS_URL = 'https://edu.vik.bme.hu/mod/quiz/report.php?id=124407&mode=overview'

ssl._create_default_https_context = ssl._create_unverified_context
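# Note (assumption, not part of the committed file): the assignment above turns
# off certificate verification for urllib-based HTTPS contexts, and the
# verify=False flag in shibboleth_auth() does the same for the requests session,
# presumably so the login flow also works when the server certificate chain
# cannot be validated locally.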
def shibboleth_auth(session, url, credentials):
    print("Shibboleth Auth…")
    print("├── request target resource")
    response = session.get(url, verify=False, allow_redirects=True)
    soup = BeautifulSoup(response.content, 'html.parser')
    if link := soup.find('a', class_='login-identityprovider-btn'):
        print("├── landing page")
        response = session.get(link['href'], allow_redirects=True)
        soup = BeautifulSoup(response.content.decode('UTF-16LE'), 'html.parser')
    if soup.find('input', id='login-form_password'):
        print("├── login credentials")
        form = soup.find('form')
        data = [(name, value) for name, value in get_form_data(form) if name not in credentials]
        data.extend(credentials.items())
        data.append(('_eventId_proceed', ''))
        response = session.post(urljoin(response.url, form['action']), data=dict(data))
        soup = BeautifulSoup(response.content, 'html.parser')
    if soup.find('input', attrs={'name': '_shib_idp_consentIds'}):
        print("├── grant permissions")
        form = soup.find('form')
        data = get_form_data(form)
        response = session.post(urljoin(response.url, form['action']), data=data)
        soup = BeautifulSoup(response.content, 'html.parser')
    if soup.find('input', attrs={'name': 'SAMLResponse'}):
        print("├── forward login token")
        form = soup.find('form')
        data = get_form_data(form)
        response = session.post(urljoin(response.url, form['action']), data=data)
    print("└── done")
    return response
def get_form_data(form):
    return [
        (elem['name'], elem['value'])
        for elem in form.find_all('input', attrs={'name': True, 'value': True})
        if elem['type'] != 'submit' or elem['value'].lower() != 'reject'
    ]
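# Illustration (assumption, not part of the committed file): for a consent form
# such as
#   <form action="/consent">
#     <input type="hidden" name="csrf" value="x">
#     <input type="submit" name="_eventId" value="Reject">
#   </form>
# get_form_data() returns [('csrf', 'x')]: hidden fields are kept, while the
# "Reject" submit button is filtered out so the scraper never declines the
# attribute-release consent.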
# Saves CodeRunner-type response histories (otherwise only the last answer can be downloaded)
def main():
    with requests.Session() as session:
        shibboleth_auth(session, ILIAS_URL, {
            # BME directory login credentials, taken from the environment
            'j_username': os.environ['USER'],
            'j_password': os.environ['PASS'],
            '_shib_idp_revokeConsent': '1',
        })
        response = session.get(ILIAS_URL)
        content = response.content.decode("UTF-8")
        soup = BeautifulSoup(content, 'html.parser')
        responses = soup.find_all("td", class_="cell c4")  # class of the Neptun-code column
        neptuns = [a.get_text() for a in responses if len(a.get_text()) == 6]
        responses = soup.find_all("td", class_="cell c27", limit=len(neptuns))  # class of the given sub-question's (CodeRunner) column
        hrefs = [a.find("a", recursive=False)["href"] for a in responses]
        # Save the CodeRunner-type response histories (otherwise only the last answer can be downloaded)
        todict = {}
        for i in range(0, len(neptuns)):
            neptun = neptuns[i]
            url = hrefs[i]
            response = session.get(url)
            content = response.content.decode("UTF-8")
            soup = BeautifulSoup(content, 'html.parser')
            soup = soup.find("div", class_="responsehistoryheader")
            responses = soup.find_all("td", class_="cell c1")
            times = [a.get_text() for a in responses]
            responses = soup.find_all("td", class_="cell c2")
            codes = [a.get_text() for a in responses]
            responses = soup.find_all("td", class_="cell c3")
            states = [a.get_text() for a in responses]
            responses = soup.find_all("td", class_="cell c4 lastcol")
            marks = [a.get_text() for a in responses]
            todict[neptun] = {"url": url, "times": times, "codes": codes, "states": states, "marks": marks}

    with open("ZH2Ac27.json", 'w', encoding='utf8') as json_file:
        json.dump(todict, json_file)


main()
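The script dumps its results to ZH2Ac27.json, keyed by Neptun code. A minimal sketch of reading that dump back, assuming the structure built in main() above; the Neptun code "ABC123" is hypothetical:

import json

with open("ZH2Ac27.json", encoding="utf8") as f:
    results = json.load(f)

# Each entry holds the review URL plus the parallel lists scraped from the
# response-history table: submission times, submitted code, states and marks.
entry = results.get("ABC123")  # "ABC123" is a hypothetical Neptun code
if entry:
    for t, code, state, mark in zip(entry["times"], entry["codes"],
                                    entry["states"], entry["marks"]):
        print(t, state, mark)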