Giter Site home page Giter Site logo

Comments (1)

sweep-ai avatar sweep-ai commented on September 25, 2024

Here's the PR! #10.

⚡ Sweep Free Trial: I used GPT-3.5 to create this ticket. You have 3 GPT-4 tickets left for the month and 0 for the day. For more GPT-4 tickets, visit our payment portal. To get Sweep to recreate this ticket, leave a comment prefixed with "sweep:" or edit the issue.


Step 1: 🔍 Code Search

I found the following snippets in your repository. I will now analyze these snippets and come up with a plan.

Some code snippets I looked at (click to expand). If some file is missing from here, you can mention the path in the ticket description.

import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
from sanitize_filename import sanitize
class Course:
    """
    Class for storing course information.

    A course is created from its page URL; the display text, the parsed page
    and the list of files are attached afterwards through the methods below.
    """

    def __init__(self, course_url: str) -> None:
        """
        :param course_url: URL of the course page (must contain an ``id`` value).
        """
        self.course_url = course_url
        # Text after the first "id" occurrence, minus one separator character,
        # up to (not including) the next "&".
        self.id = self.course_url.split("id")[1][1:].split("&")[0]
        self.files = []

    def __str__(self) -> str:
        return f"[{self.course_code}] {self.course_name}"

    __repr__ = __str__

    @staticmethod
    def get_course_regex() -> re.Pattern:
        """
        Pattern with two groups: the text between ``(|`` and ``|)`` and the
        text that follows it up to the next ``(``.
        """
        return re.compile(r"\n*[\(][\|]([^\|]*)[\|][\)]([^\(]*)[\(].*\n*")

    @property
    def course_code(self) -> str:
        """The part of the course text before the first hyphen, stripped."""
        return self.course_text.split("-")[0].strip()

    @property
    def course_name(self) -> str:
        """
        The part of the course text after the first hyphen, stripped.

        Only the first hyphen separates code from name, so a name that itself
        contains hyphens is returned in full instead of being cut short.
        """
        return self.course_text.split("-", 1)[1].strip()

    def set_course_text(self, course_text: str) -> None:
        """
        Set the course text. (e.g. "CS 201 - Programming 1")
        course code + course name
        :param course_text: The course text.
        """
        self.course_text = course_text

    def create_course_directory(self) -> None:
        """Create the target directory of every file in the course."""
        for file in self.files:
            os.makedirs(file.dir_path, exist_ok=True)

    def set_course_soup(self, course_soup: BeautifulSoup) -> None:
        """
        Attach the parsed course page.
        :param course_soup: The parsed course page.
        """
        self.course_soup = course_soup

    def get_course_files(self, course_path) -> None:
        """
        Get the list of files in the course.

        Every ``card-body`` element of the course page that contains a
        ``strong`` tag is wrapped in a CMSFile and appended to ``self.files``.
        :param course_path: Directory under which the course files are placed.
        """
        files_body = self.course_soup.find_all(class_="card-body")
        for item in files_body:
            # check if the card is not a course content, useful for `Filter weeks` card
            if item.find('strong') is None:
                continue
            self.files.append(CMSFile(soup=item, course_path=course_path))
class CMSFile:
    """a cms file object"""

    def __init__(self, soup: BeautifulSoup, course_path) -> None:
        """
        Derive the file's URL, week label, name and local path from its card.

        :param soup: The parsed card element describing the file.
        :param course_path: Directory of the course the file belongs to.
        """
        # Imported here rather than at module level; presumably this avoids a
        # circular import, since scraper imports this module — TODO confirm.
        from scraper import HOST

        self.soup = soup
        self.url = HOST + self.soup.find("a")["href"]
        # The week heading is the <h2> found four ancestors above the card.
        self.week = self.soup.parent.parent.parent.parent.find("h2").text.strip()
        self.week = re.sub(r"Week: (.*)", "\\1", self.week)
        # Reformat the "YYYY-MM-DD" date into the "W MM-DD" directory name.
        self.week = datetime.strptime(self.week, "%Y-%m-%d").strftime("W %m-%d")
        self.description = re.sub(self.get_file_regex(), "\\1", self.soup.find("div").text).strip()
        self.name = re.sub(self.get_file_regex(), "\\1", self.soup.find("strong").text).strip()
        self.name = sanitize(self.name)
        # Everything after the last "." of the URL.
        self.extension = self.url.rsplit(".", 1)[1]
        self.dir_path = os.path.join(course_path, self.week)
        self.path = os.path.join(self.dir_path, f"{self.name}.{self.extension}")

    @staticmethod
    def get_file_regex() -> re.Pattern:
        """
        Pattern matching an optional run of digits followed by `` - ``; its
        single group captures the rest of the text.
        """
        return re.compile(r"[0-9]* - (.*)")

    def __str__(self) -> str:
        return f"{self.name}"

    __repr__ = __str__

import threading
import os
import random
import re
import json
from typing import Dict, List
import requests
import yaml
from bs4 import BeautifulSoup
from course import CMSFile, Course
from requests_ntlm import HttpNtlmAuth
from tqdm import tqdm
from auth import Credentials, CMSAuthenticationError
YML_FILE = "config.yml"
# Read the configuration once at import time; the context manager closes the
# file handle instead of leaving it open for the garbage collector.
with open(YML_FILE) as _config_file:
    YML_CONFIG = yaml.safe_load(_config_file)
# Base URL that relative CMS links are appended to.
HOST = YML_CONFIG["host"]
# Root directory the course folders are created under.
DOWNLOADS_DIR = YML_CONFIG["downloads_dir"]
# Colours a download progress bar is randomly picked from.
TQDM_COLORS = [
    "#ff0000",
    "#00ff00",
    "#0000ff",
    "#ffff00",
    "#00ffff",
    "#ff00ff",
    "#ffffff",
    "#000000",
]
class Scraper:
    """
    Class for scraping data from GUC CMS.
    """

    def __init__(self, credentials: Credentials):
        """
        Build an NTLM-authenticated session from the given credentials.

        :param credentials: Object providing ``username`` and ``password``.
        """
        self.credentials: Credentials = credentials
        self.session: requests.Session = requests.Session()
        self.session.auth = HttpNtlmAuth(credentials.username, credentials.password)
        self.session.headers.update({"User-Agent": "Mozilla/5.0"})
        self.html_parser: str = "html.parser"
        # Keyword arguments passed to every GET request.
        # NOTE(review): "verify": False disables TLS certificate verification
        # in requests — confirm this is intended.
        self.get_args: Dict[str, object] = {
            "auth": self.session.auth,
            "verify": False,
        }

    @property
    def home_soup(self) -> BeautifulSoup:
        """
        Get home page.

        Every access issues a new GET request to HOST and parses the result.
        """
        return BeautifulSoup(self.session.get(HOST, **self.get_args).text, self.html_parser)

    def run(self) -> None:
        """
        Run the scraper.

        Authenticates, collects courses and their files, creates the target
        directories and downloads every file that is not on disk yet.
        """
        # authenticate
        try:
            self.authenticate()
        except CMSAuthenticationError:
            # NOTE(review): the session auth is built once in __init__ and is
            # not rebuilt here — confirm that the retry really uses new
            # credentials after remove_credentials().
            self.credentials.remove_credentials()
            return self.run()
        self.__scrap_courses()
        self.__scrap_files()
        self.__create_courses_dir()
        self.__download_all_files()

    def __download_all_files(self):
        """Download all pending files, one thread per file, and wait for them."""
        # download files in parallel using threads
        threads = []
        for file in self.files:
            thread = threading.Thread(target=self.__download_file, args=(file,))
            thread.start()
            threads.append(thread)
        # Block until every download has finished so that the caller only
        # regains control once the files are fully written.
        for thread in threads:
            thread.join()

    def __create_courses_dir(self):
        """Create the directories of every course."""
        for course in self.courses:
            course.create_course_directory()

    def __scrap_courses(self) -> None:
        """Load courses from ``.courses.json`` if present, else scrape and cache them."""
        # cache the courses name and links
        if os.path.exists(".courses.json"):
            self.__get_cached_courses()
        else:
            self.__cache_courses()

    def __cache_courses(self):
        """Scrape course names and links, then write them to ``.courses.json``."""
        self.course_names = self.__get_course_names()
        self.courses = self.__get_available_courses()
        self._populate_courses_data()
        with open(".courses.json", "w") as f:
            data = {course.course_text: course.course_url for course in self.courses}
            json.dump(data, f, indent=4)

    def __get_cached_courses(self):
        """Rebuild the course list from the ``.courses.json`` mapping of text to URL."""
        with open(".courses.json", "r") as f:
            courses_data = json.load(f)
        courses = []
        for course_text, link in courses_data.items():
            course = Course(course_url=link)
            course.set_course_text(course_text)
            courses.append(course)
        self.courses = courses
        self.course_names = list(courses_data.keys())
        self.courses_links = list(courses_data.values())
        self._populate_courses_data()

    def __scrap_files(self):
        """Collect the files of every course under its own downloads directory."""
        for course in self.courses:
            course.get_course_files(os.path.join(DOWNLOADS_DIR, str(course)))

    def _populate_courses_data(self):
        """Assign each course its text and fetch its page."""
        # populate courses with names
        for course, course_text in zip(self.courses, self.course_names):
            course.set_course_text(course_text)
        # populate courses soups
        self.get_courses_soup()

    def authenticate(self) -> None:
        """
        Authenticate with GUC CMS.

        :raises CMSAuthenticationError: If the home page does not answer with status 200.
        """
        response = self.session.get(HOST, **self.get_args)
        if response.status_code != 200:
            raise CMSAuthenticationError("Authentication failed.")

    def __get_available_courses(self) -> List[Course]:
        """
        Get list of courses.

        Keeps the home-page links that point at a course view and prefixes
        them with HOST.
        """
        self.courses_links = [link.get("href") for link in self.home_soup.find_all("a") if link.get("href")]
        self.courses_links = [
            HOST + link for link in self.courses_links if re.match(r"\/apps\/student\/CourseViewStn\?id(.*)", link)
        ]
        return [Course(link) for link in self.courses_links]

    def __get_course_names(self) -> List[str]:
        "get course names"
        courses_table = list(
            self.home_soup.find(
                "table",
                {"id": "ContentPlaceHolderright_ContentPlaceHoldercontent_GridViewcourses"},
            )
        )
        # The first two children and the last child of the table are skipped.
        return [
            re.sub(
                Course.get_course_regex(),
                r"\1-\2",
                row.text.strip(),
            ).strip()
            for row in courses_table[2:-1]
        ]

    def get_courses_soup(self) -> None:
        """
        Get courses page.

        Fetches and parses the page of every course and stores it on the course.
        """
        for course in self.courses:
            course.set_course_soup(
                BeautifulSoup(
                    self.session.get(course.course_url, **self.get_args).text,
                    self.html_parser,
                )
            )

    def __download_file(self, file: CMSFile) -> None:
        """
        Stream one file to ``file.path`` while showing a progress bar.

        :param file: The file to download.
        :raises CMSAuthenticationError: If the response status is not 200.
        """
        response = self.session.get(file.url, **self.get_args, stream=True, allow_redirects=True)
        if response.status_code != 200:
            raise CMSAuthenticationError("Authentication failed.")
        # The header may be missing; fall back to a progress bar without a
        # known total instead of failing on int(None).
        content_length = response.headers.get("Content-Length")
        total_size = int(content_length) if content_length is not None else None
        with open(file.path, "wb") as f:
            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=file.path,
                initial=0,
                dynamic_ncols=True,
                colour=random.choice(TQDM_COLORS),
            ) as t:
                for chunk in response.iter_content(chunk_size=1024):
                    f.write(chunk)
                    t.update(len(chunk))

    @property
    def files(self) -> List[CMSFile]:
        """
        Get all files.

        Only files whose target path does not exist yet are returned.
        """
        return [file for course in self.courses for file in course.files if not os.path.exists(file.path)]


Step 2: 🧐 Snippet Analysis

From looking through the relevant snippets, I decided to make the following modifications:

File Path Proposed Changes
src/scraper.py In the Scraper class, import Iterfzf. In the __scrap_courses method, use Iterfzf to present the user with a list of course names to select from. Store the selected course names in a list. In the __scrap_files method, for each selected course, use Iterfzf to present the user with a list of file names to select from. Store the selected file names in a list.
src/course.py In the Course and CMSFile classes, modify the str method to return a string representation of the course or file that includes the course or file name.

Step 3: 📝 Planning

I have created a plan for writing the pull request. I am now working through my plan and coding the required changes to address this issue. Here is the planned pull request:

Implement Iterfzf for course and file selection
sweep/iterfzf-selection

Description

This PR implements the use of Iterfzf library for interactive course and file selection in the Scraper class. It modifies the __scrap_courses and __scrap_files methods to present the user with an interactive selection interface using Iterfzf. Additionally, it modifies the str method of the Course and CMSFile classes to return a string representation of the course or file that can be displayed in the Iterfzf interface.

Summary of Changes

  • Added Iterfzf library as a dependency in the pyproject.toml file.
  • Modified the Scraper class in the scraper.py file to use Iterfzf for course and file selection.
  • In the __scrap_courses method, used Iterfzf to present the user with a list of course names to select from and stored the selected course names in a list.
  • In the __scrap_files method, for each selected course, used Iterfzf to present the user with a list of file names to select from and stored the selected file names in a list.
  • Modified the str method of the Course and CMSFile classes in the course.py file to return a string representation of the course or file that includes the course or file name.

Step 4: ⌨️ Coding

File Instructions Progress
src/scraper.py In the Scraper class, import Iterfzf. In the __scrap_courses method, use Iterfzf to present the user with a list of course names to select from. Store the selected course names in a list. In the __scrap_files method, for each selected course, use Iterfzf to present the user with a list of file names to select from. Store the selected file names in a list. ✅ Commit d45c286
src/course.py In the Course and CMSFile classes, modify the str method to return a string representation of the course or file that includes the course or file name. ✅ Commit d45c286

Step 5: 🔁 Code Review

Here are my self-reviews of my changes at sweep/iterfzf-selection.

Here is the 1st review

No changes required. The code changes in both src/course.py and src/scraper.py are correct and do not contain any errors or unimplemented sections. The addition of the iterfzf module in src/scraper.py enhances the functionality of the __scrap_courses and __scrap_files methods by allowing the user to select which courses and files they are interested in. Good job!

I finished incorporating these changes.


To recreate the pull request, leave a comment prefixed with "sweep:" or edit the issue.
Join Our Discord

from cms-downloader-refined.

Related Issues (9)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.