diff --git a/.gitignore b/.gitignore index a41cf6cd7..98f7f8359 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ courses20.xml compose-dev.yaml rpi_data/get-summer-2023-2.sh rpi_data/summer-20232.csv +.venv \ No newline at end of file diff --git a/docker-compose.development.yml b/docker-compose.development.yml index 038529a8b..c44757d8a 100644 --- a/docker-compose.development.yml +++ b/docker-compose.development.yml @@ -28,12 +28,12 @@ services: - ./src/web:/app - web_node_modules:/app/node_modules/ environment: - - YACS_API_HOST=http://yacs_api:5000 + - YACS_API_HOST=http://yacs_api:4000 yacs_api: - command: /bin/bash -c "python tables/database_session.py && PYTHONPATH=. alembic upgrade head && uvicorn app:app --reload --host 0.0.0.0 --port 5000" + command: /bin/bash -c "python tables/database_session.py && PYTHONPATH=. alembic upgrade head && uvicorn app:app --reload --host 0.0.0.0 --port 4000" ports: - - 5000:5000 + - 4000:4000 volumes: - ./src/api:/usr/src environment: @@ -55,3 +55,13 @@ services: - POSTGRES_DB=yacs - POSTGRES_USER=yacs - POSTGRES_PASSWORD=${DB_PASS:-easy_dev_pass} + + yacs_cron: + ports: + - 4321:4321 + volumes: + - ./src/cron:/usr/src + environment: + - YACS_API_HOST=http://yacs_api:4000 + - GECKO_PATH=/usr/local/bin/geckodriver + - API_SIGN_KEY=${API_SIGN_KEY:-secretKey} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 4a49c90a3..51342c83b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,3 +30,9 @@ services: container_name: yacs_db image: postgres:12-alpine + yacs_cron: + restart: unless-stopped + container_name: yacs_cron + build: + context: ./src/cron + dockerfile: Dockerfile \ No newline at end of file diff --git a/rpi_data/modules/parse_runner.py b/rpi_data/modules/parse_runner.py index 7bdd3ac29..8c16a7dc8 100644 --- a/rpi_data/modules/parse_runner.py +++ b/rpi_data/modules/parse_runner.py @@ -1,8 +1,8 @@ #!/usr/bin/env python from selenium import webdriver from selenium.webdriver.firefox.options import Options -import headless_login as login -import new_parse as parser +import cron.headless_login as login +import cron.new_parse as parser import sys from datetime import datetime import pytz diff --git a/src/api/Dockerfile b/src/api/Dockerfile index ba9cc1da9..4c46ab735 100644 --- a/src/api/Dockerfile +++ b/src/api/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.8-slim RUN mkdir -p /usr/src WORKDIR /usr/src COPY ./requirements.txt /usr/src/ +RUN apt-get update && apt-get install -y libpq-dev build-essential RUN pip install --no-cache-dir -r requirements.txt COPY . /usr/src/ - -CMD [ "sh", "scripts/start.sh" ] +CMD [ "sh", "scripts/start.sh" ] \ No newline at end of file diff --git a/src/cron/Dockerfile b/src/cron/Dockerfile new file mode 100644 index 000000000..5ccbed60d --- /dev/null +++ b/src/cron/Dockerfile @@ -0,0 +1,23 @@ +# FROM selenium/standalone-firefox:latest +FROM python:3.9-slim + + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ca-certificates curl firefox-esr \ + && rm -fr /var/lib/apt/lists/* \ + && curl -L https://github.com/mozilla/geckodriver/releases/download/v0.30.0/geckodriver-v0.30.0-linux64.tar.gz | tar xz -C /usr/local/bin \ + && apt-get purge -y ca-certificates curl + +RUN apt-get update && apt-get -y install cron vim +COPY crontab /etc/cron.d/crontab +RUN chmod 0644 /etc/cron.d/crontab +RUN touch /var/log/cron.log + +RUN mkdir -p /usr/src +WORKDIR /usr/src +COPY ./requirements.txt /usr/src/ +RUN pip install --no-cache-dir -r requirements.txt +COPY . /usr/src/ + +CMD ["cron", "-f"] \ No newline at end of file diff --git a/rpi_data/modules/ci_scraper.py b/src/cron/ci_scraper.py similarity index 100% rename from rpi_data/modules/ci_scraper.py rename to src/cron/ci_scraper.py diff --git a/rpi_data/modules/course.py b/src/cron/course.py similarity index 100% rename from rpi_data/modules/course.py rename to src/cron/course.py diff --git a/rpi_data/modules/courses_scraper.py b/src/cron/courses_scraper.py similarity index 98% rename from rpi_data/modules/courses_scraper.py rename to src/cron/courses_scraper.py index 07360f82c..631a11a77 100644 --- a/rpi_data/modules/courses_scraper.py +++ b/src/cron/courses_scraper.py @@ -99,7 +99,12 @@ def scrape_single_course(prefix:str, code:str, nav: str, cat: str) -> dict: return dict() if "No courses found" in check.get_text(strip=True) or "" == check.get_text(strip=True): return dict() - nopop = check.find("a", {"aria-expanded": "false"}).get("href") # gets the link to the nopopup page + + element = check.find("a", {"aria-expanded": "false"}) + nopop = element.get("href") if element else None + if nopop is None: + return dict() + # nopop = check.find("a", {"aria-expanded": "false"}).get("href") # gets the link to the nopopup page ''' Beautiful soup for the nopopup page ''' diff --git a/src/cron/crontab b/src/cron/crontab new file mode 100644 index 000000000..165697479 --- /dev/null +++ b/src/cron/crontab @@ -0,0 +1 @@ +00 * * * * root /usr/local/bin/python3 /usr/src/no_login.py >> /var/log/cron.log 2>&1 diff --git a/rpi_data/modules/goldy_parse.py b/src/cron/goldy_parse.py similarity index 100% rename from rpi_data/modules/goldy_parse.py rename to src/cron/goldy_parse.py diff --git a/rpi_data/modules/headless_login.py b/src/cron/headless_login.py similarity index 100% rename from rpi_data/modules/headless_login.py rename to src/cron/headless_login.py diff --git a/rpi_data/modules/new_parse.py b/src/cron/new_parse.py similarity index 100% rename from rpi_data/modules/new_parse.py rename to src/cron/new_parse.py diff --git a/rpi_data/modules/no_login.py b/src/cron/no_login.py similarity index 92% rename from rpi_data/modules/no_login.py rename to src/cron/no_login.py index d6ca8a853..6c54c07c5 100644 --- a/rpi_data/modules/no_login.py +++ b/src/cron/no_login.py @@ -290,8 +290,9 @@ def date_split(date): # format dates ''' def no_login_scrape(term: str, num_browsers: int): options = Options() + services = webdriver.FirefoxService( executable_path=os.environ.get('GECKO_PATH', '/usr/local/bin/geckodriver') ) options.add_argument("--headless") - driver = webdriver.Firefox(options=options) # starter code which uses selenium + driver = webdriver.Firefox(options=options, service=services) # starter code which uses selenium subjects = old.findAllSubjectCodes(driver) # finds all subject codes nav, cat = cs.navigate_to_course(driver, term) # finds the navigation and catalog ids, which are each used to build a course search query. driver.quit() @@ -335,6 +336,7 @@ def no_login_scrape(term: str, num_browsers: int): parent = os.path.abspath(os.path.join(dir_path, os.pardir)) path = os.path.join(parent, number_to_term(term).lower().replace(" ", "-") + ".csv") old.writeCSV(courses, path) + return path ''' Scrapes the prerequisites for multiple courses at once. @@ -368,7 +370,31 @@ def add_goldy_info(course: Course, goldy_info: dict): course.raw = "Prerequisites: " + goldy_info[checking] if __name__ == "__main__": - no_login_scrape("202409", 15) - #driver = webdriver.Firefox() + print("Our test works at", datetime.now()) + + # options = Options() + # services = webdriver.FirefoxService( executable_path=os.environ.get('GECKO_PATH', '/usr/local/bin/geckodriver') ) + # options.add_argument("--headless") + # driver = webdriver.Firefox(options=options, service=services) + + # print(cs.scrape_single_course(driver, "MANE", "6990", 202509)) + + file = no_login_scrape("202509", 15) + fileName = os.path.basename(os.path.normpath(file)) + url = os.environ.get('YACS_API_HOST', 'http://yacs_api:4000') + payload = {'isPubliclyVisible': 'on'} + + files=[ + ('file',(fileName,open(file,'rb'),'text/csv')) + ] + + headers = { + 'X-API-KEY': os.environ.get('API_SIGN_KEY', None) + } + + resp = requests.post(url + '/api/bulkCourseUpload', headers=headers, data=payload, files=files) + print(resp.text) + + # driver = webdriver.Firefox() #print(cs.scrape_single_course(driver, "CSCI", "1100", 202409)) - #print(link_scrape("202409", "https://sis.rpi.edu/rss/bwckctlg.p_disp_listcrse?term_in=202409&subj_in=CHME&crse_in=4980&schd_in=L", "CHME")) + #print(link_scrape("202409", "https://sis.rpi.edu/rss/bwckctlg.p_disp_listcrse?term_in=202409&subj_in=CHME&crse_in=4980&schd_in=L", "CHME")) \ No newline at end of file diff --git a/src/cron/requirements.txt b/src/cron/requirements.txt new file mode 100644 index 000000000..de9ed7083 --- /dev/null +++ b/src/cron/requirements.txt @@ -0,0 +1,7 @@ +selenium==4.28.1 +beautifulsoup4==4.12.3 +bs4==0.0.2 +pypdf==5.1.0 +pandas==2.2.3 +requests==2.32.3 +regex==2024.11.6 \ No newline at end of file