sweep: Use Iterfzf to select courses and files about cms-downloader-refined HOT 1 OPEN

aboueleyes commented on September 25, 2024 1

sweep: Use Iterfzf to select courses and files

from cms-downloader-refined.

Comments (1)

sweep-ai commented on September 25, 2024

Here's the PR! #10.

⚡ Sweep Free Trial: I used GPT-3.5 to create this ticket. You have 3 GPT-4 tickets left for the month and 0 for the day. For more GPT-4 tickets, visit our payment portal. To get Sweep to recreate this ticket, leave a comment prefixed with "sweep:" or edit the issue.

Install Sweep Configs: Pull Request

Step 1: 🔍 Code Search

I found the following snippets in your repository. I will now analyze these snippets and come up with a plan.

Some code snippets I looked at (click to expand). If some file is missing from here, you can mention the path in the ticket description.

cms-downloader-refined/src/course.py

Lines 1 to 95 in dfeaf48

    
           import os 
        
           import re 
        
           from datetime import datetime 
        
           from bs4 import BeautifulSoup 
        
           from sanitize_filename import sanitize 
        
           class Course:
               """
               Class for storing course information.

               ``course_text`` and ``course_soup`` are not set by ``__init__``; they are
               attached afterwards through ``set_course_text`` and ``set_course_soup``
               and must exist before the properties, ``str()`` or ``get_course_files``
               are used.
               """

               def __init__(self, course_url: str) -> None:
                   """
                   :param course_url: URL of the course page, containing ``id=<value>``.
                   :raises IndexError: If the URL does not contain the text ``id``.
                   """
                   self.course_url = course_url
                   # Take what follows the first "id", drop the single separator
                   # character after it (normally "="), and cut at the next "&".
                   self.id = self.course_url.split("id")[1][1:].split("&")[0]
                   self.files = []

               def __str__(self) -> str:
                   return f"[{self.course_code}] {self.course_name}"

               __repr__ = __str__

               @staticmethod
               def get_course_regex() -> re.Pattern:
                   """
                   Return the pattern used to parse one entry of the courses table.

                   Group 1 is the text between ``(|`` and ``|)``; group 2 is the text
                   that follows, up to the next ``(``.
                   """
                   return re.compile(r"\n*[\(][\|]([^\|]*)[\|][\)]([^\(]*)[\(].*\n*")

               @property
               def course_code(self) -> str:
                   """The part of the course text before the first hyphen."""
                   return self.course_text.split("-", 1)[0].strip()

               @property
               def course_name(self) -> str:
                   """
                   The part of the course text after the first hyphen.

                   The text is split only once so that a name which itself contains
                   a hyphen is returned in full.

                   :raises IndexError: If the course text contains no hyphen.
                   """
                   return self.course_text.split("-", 1)[1].strip()

               def set_course_text(self, course_text: str) -> None:
                   """
                   Set the course text. (e.g. "CS 201 - Programming 1")
                   course code + course name

                   :param course_text: The course text.
                   """
                   self.course_text = course_text

               def create_course_directory(self) -> None:
                   """Create the target directory of every file collected so far."""
                   for file in self.files:
                       os.makedirs(file.dir_path, exist_ok=True)

               def set_course_soup(self, course_soup: BeautifulSoup) -> None:
                   """
                   :param course_soup: Parsed HTML of the course page.
                   """
                   self.course_soup = course_soup

               def get_course_files(self, course_path) -> None:
                   """
                   Get the list of files in the course.

                   Every ``card-body`` element that contains a ``<strong>`` tag is
                   wrapped in a ``CMSFile`` and appended to ``self.files``.

                   :param course_path: Directory handed to each ``CMSFile`` as the
                       course's download location.
                   """
                   for item in self.course_soup.find_all(class_="card-body"):
                       # check if the card is not a course content, useful for `Filter weeks` card
                       if item.find('strong') is None:
                           continue
                       self.files.append(CMSFile(soup=item, course_path=course_path))
        
           class CMSFile:
               """A downloadable file described by one content card of a course page."""

               def __init__(self, soup: BeautifulSoup, course_path) -> None:
                   """
                   Derive URL, week label, names and target paths from a content card.

                   :param soup: The card element describing the file.
                   :param course_path: Directory of the owning course; the file goes
                       into a per-week sub-directory below it.
                   :raises ValueError: If the week heading is not a ``%Y-%m-%d`` date.
                   """
                   # Imported here instead of at module level because scraper.py
                   # itself imports this module.
                   from scraper import HOST

                   self.soup = soup
                   self.url = HOST + self.soup.find("a")["href"]

                   # The week heading is looked up four levels above the card;
                   # "Week: 2023-10-01" becomes "W 10-01".
                   week_heading = self.soup.parent.parent.parent.parent.find("h2").text.strip()
                   week_date = re.sub(r"Week: (.*)", "\\1", week_heading)
                   self.week = datetime.strptime(week_date, "%Y-%m-%d").strftime("W %m-%d")

                   numbered = self.get_file_regex()
                   self.description = re.sub(numbered, "\\1", self.soup.find("div").text).strip()
                   self.name = sanitize(re.sub(numbered, "\\1", self.soup.find("strong").text).strip())

                   self.extension = self._extension_from_url(self.url)
                   self.dir_path = os.path.join(course_path, self.week)
                   # Avoid a dangling "." when the URL carries no extension.
                   filename = f"{self.name}.{self.extension}" if self.extension else self.name
                   self.path = os.path.join(self.dir_path, filename)

               @staticmethod
               def _extension_from_url(url: str) -> str:
                   """
                   Return the file extension of *url* without the leading dot.

                   Only the path component is inspected, so a query string or dots in
                   the host name or in directory names never end up in the result.
                   An empty string is returned when the last path segment has no
                   extension.
                   """
                   from urllib.parse import urlparse

                   return os.path.splitext(urlparse(url).path)[1].lstrip(".")

               @staticmethod
               def get_file_regex() -> re.Pattern:
                   """Return the pattern matching ``<digits> - <text>``; group 1 is the text."""
                   return re.compile(r"[0-9]* - (.*)")

               def __str__(self) -> str:
                   return f"{self.name}"

               __repr__ = __str__

cms-downloader-refined/src/scraper.py

Lines 1 to 204 in dfeaf48

    
           import threading 
        
           import os 
        
           import random 
        
           import re 
        
           import json 
        
           from typing import Dict, List 
        
           import requests 
        
           import yaml 
        
           from bs4 import BeautifulSoup 
        
           from course import CMSFile, Course 
        
           from requests_ntlm import HttpNtlmAuth 
        
           from tqdm import tqdm 
        
           from auth import Credentials, CMSAuthenticationError 
        
           YML_FILE = "config.yml"

           # The configuration is read once at import time. The context manager
           # closes the file handle instead of leaving it to the garbage collector.
           with open(YML_FILE, encoding="utf-8") as _config_file:
               YML_CONFIG = yaml.safe_load(_config_file)

           HOST = YML_CONFIG["host"]
           DOWNLOADS_DIR = YML_CONFIG["downloads_dir"]

           # Colours a download progress bar is picked from at random.
           TQDM_COLORS = [
               "#ff0000",
               "#00ff00",
               "#0000ff",
               "#ffff00",
               "#00ffff",
               "#ff00ff",
               "#ffffff",
               "#000000",
           ]
        
           class Scraper:
               """
               Class for scraping data from GUC CMS.
               """

               # Number of authentication attempts ``run`` makes before giving up.
               MAX_AUTH_ATTEMPTS = 3

               def __init__(self, credentials: Credentials):
                   """
                   :param credentials: Object providing ``username`` and ``password``.
                   """
                   self.credentials: Credentials = credentials
                   self.session: requests.Session = requests.Session()
                   self.session.auth = HttpNtlmAuth(credentials.username, credentials.password)
                   self.session.headers.update({"User-Agent": "Mozilla/5.0"})
                   self.html_parser: str = "html.parser"
                   # Keyword arguments shared by every GET request.
                   # SECURITY NOTE(review): "verify": False turns off TLS certificate
                   # verification for all requests. Left unchanged because the reason
                   # is not visible here -- confirm it is still required.
                   self.get_args: Dict[str, object] = {
                       "auth": self.session.auth,
                       "verify": False,
                   }

               @property
               def home_soup(self) -> BeautifulSoup:
                   """
                   Get home page.

                   Every access performs a new GET request to ``HOST``.
                   """
                   return BeautifulSoup(self.session.get(HOST, **self.get_args).text, self.html_parser)

               def run(self) -> None:
                   """
                   Run the scraper.

                   Authenticates, loads the course list, collects the files of every
                   course, creates the target directories and downloads the files.

                   :raises CMSAuthenticationError: If authentication still fails after
                       ``MAX_AUTH_ATTEMPTS`` attempts.
                   """
                   # Retry in a bounded loop: recursing on every failure would only
                   # end with a RecursionError when the server keeps rejecting us.
                   # NOTE(review): the session's auth object is built once in
                   # __init__, so a retry sends the same username/password -- confirm
                   # whether Credentials is expected to supply new ones.
                   for attempt in range(1, self.MAX_AUTH_ATTEMPTS + 1):
                       try:
                           self.authenticate()
                       except CMSAuthenticationError:
                           self.credentials.remove_credentials()
                           if attempt == self.MAX_AUTH_ATTEMPTS:
                               raise
                       else:
                           break

                   self.__scrap_courses()
                   self.__scrap_files()
                   self.__create_courses_dir()
                   self.__download_all_files()

               def __download_all_files(self):
                   """
                   Download every pending file, one thread per file, and wait for all.

                   An exception raised inside a download thread is not propagated to
                   the caller.
                   """
                   threads = [threading.Thread(target=self.__download_file, args=(file,)) for file in self.files]
                   for thread in threads:
                       thread.start()
                   # Block until every download ended so run() returns only when done.
                   for thread in threads:
                       thread.join()

               def __create_courses_dir(self):
                   """Create the download directories of every course."""
                   for course in self.courses:
                       course.create_course_directory()

               def __scrap_courses(self) -> None:
                   """Load the courses from ``.courses.json`` if present, else scrape them."""
                   if os.path.exists(".courses.json"):
                       self.__get_cached_courses()
                   else:
                       self.__cache_courses()

               def __cache_courses(self):
                   """Scrape course names and links and store them in ``.courses.json``."""
                   self.course_names = self.__get_course_names()
                   self.courses = self.__get_available_courses()
                   self._populate_courses_data()
                   # Build the mapping first so the cache file is not created (and
                   # left empty) when a course is missing its text.
                   data = {course.course_text: course.course_url for course in self.courses}
                   with open(".courses.json", "w") as f:
                       json.dump(data, f, indent=4)

               def __get_cached_courses(self):
                   """Rebuild the course list from the ``.courses.json`` mapping of text to URL."""
                   with open(".courses.json", "r") as f:
                       courses_data = json.load(f)

                   courses = []
                   for course_text, link in courses_data.items():
                       course = Course(course_url=link)
                       course.set_course_text(course_text)
                       courses.append(course)

                   self.courses = courses
                   self.course_names = list(courses_data.keys())
                   self.courses_links = list(courses_data.values())
                   self._populate_courses_data()

               def __scrap_files(self):
                   """Collect the files of every course below ``DOWNLOADS_DIR/<course>``."""
                   for course in self.courses:
                       course.get_course_files(os.path.join(DOWNLOADS_DIR, str(course)))

               def _populate_courses_data(self):
                   """Attach the course texts and the parsed course pages to the courses."""
                   # populate courses with names; courses and names are paired by position
                   for course, course_text in zip(self.courses, self.course_names):
                       course.set_course_text(course_text)

                   # populate courses soups
                   self.get_courses_soup()

               def authenticate(self) -> None:
                   """
                   Authenticate with GUC CMS.

                   :raises CMSAuthenticationError: If the home page does not answer
                       with status 200.
                   """
                   response = self.session.get(HOST, **self.get_args)
                   if response.status_code != 200:
                       raise CMSAuthenticationError("Authentication failed.")

               def __get_available_courses(self) -> List[Course]:
                   """
                   Get list of courses.

                   Collects every home-page link that matches the course-view URL
                   pattern, stores the absolute URLs in ``self.courses_links`` and
                   returns one ``Course`` per URL.
                   """
                   hrefs = [link.get("href") for link in self.home_soup.find_all("a") if link.get("href")]
                   self.courses_links = [
                       HOST + href for href in hrefs if re.match(r"\/apps\/student\/CourseViewStn\?id(.*)", href)
                   ]
                   return [Course(link) for link in self.courses_links]

               def __get_course_names(self) -> List[str]:
                   """
                   Get course names.

                   Each entry of the courses table is rewritten to ``<group 1>-<group 2>``
                   of ``Course.get_course_regex``.
                   """
                   courses_table = list(
                       self.home_soup.find(
                           "table",
                           {"id": "ContentPlaceHolderright_ContentPlaceHoldercontent_GridViewcourses"},
                       )
                   )
                   course_regex = Course.get_course_regex()
                   # The first two children and the last one are skipped.
                   return [re.sub(course_regex, r"\1-\2", row.text.strip()).strip() for row in courses_table[2:-1]]

               def get_courses_soup(self) -> None:
                   """
                   Get courses page.

                   Fetches every course URL and stores the parsed page on the course.
                   """
                   for course in self.courses:
                       page = self.session.get(course.course_url, **self.get_args).text
                       course.set_course_soup(BeautifulSoup(page, self.html_parser))

               def __download_file(self, file: CMSFile) -> None:
                   """
                   Stream one file to ``file.path`` while showing a progress bar.

                   :param file: The file to download.
                   :raises CMSAuthenticationError: If the server does not answer with
                       status 200.
                   """
                   response = self.session.get(file.url, **self.get_args, stream=True, allow_redirects=True)
                   try:
                       if response.status_code != 200:
                           raise CMSAuthenticationError("Authentication failed.")

                       # Content-Length may be absent; with total=None tqdm shows a
                       # plain counter instead of a percentage.
                       content_length = response.headers.get("Content-Length")
                       total_size = int(content_length) if content_length is not None else None

                       # Write to a temporary name first: the ``files`` property
                       # skips paths that already exist, so an interrupted download
                       # must not leave a truncated file under the final name.
                       partial_path = f"{file.path}.part"
                       with open(partial_path, "wb") as f, tqdm(
                           total=total_size,
                           unit="B",
                           unit_scale=True,
                           desc=file.path,
                           initial=0,
                           dynamic_ncols=True,
                           colour=random.choice(TQDM_COLORS),
                       ) as t:
                           for chunk in response.iter_content(chunk_size=1024):
                               f.write(chunk)
                               t.update(len(chunk))
                       os.replace(partial_path, file.path)
                   finally:
                       # A streamed response keeps its connection until closed.
                       response.close()

               @property
               def files(self) -> List[CMSFile]:
                   """
                   Get all files.

                   Only files whose target path does not exist yet are returned.
                   """
                   return [file for course in self.courses for file in course.files if not os.path.exists(file.path)]

Step 2: 🧐 Snippet Analysis

From looking through the relevant snippets, I decided to make the following modifications:

File Path	Proposed Changes
`src/scraper.py`	In the Scraper class, import Iterfzf. In the __scrap_courses method, use Iterfzf to present the user with a list of course names to select from. Store the selected course names in a list. In the __scrap_files method, for each selected course, use Iterfzf to present the user with a list of file names to select from. Store the selected file names in a list.
`src/course.py`	In the Course and CMSFile classes, modify the `__str__` method to return a string representation of the course or file that includes the course or file name.

Step 3: 📝 Planning

I have created a plan for writing the pull request. I am now working through my plan and coding the required changes to address this issue. Here is the planned pull request:

Implement Iterfzf for course and file selection
sweep/iterfzf-selection

Description

This PR implements the use of the Iterfzf library for interactive course and file selection in the Scraper class. It modifies the __scrap_courses and __scrap_files methods to present the user with an interactive selection interface using Iterfzf. Additionally, it modifies the `__str__` method of the Course and CMSFile classes to return a string representation of the course or file that can be displayed in the Iterfzf interface.

Summary of Changes

Added Iterfzf library as a dependency in the pyproject.toml file.

Modified the Scraper class in the scraper.py file to use Iterfzf for course and file selection.

In the __scrap_courses method, used Iterfzf to present the user with a list of course names to select from and stored the selected course names in a list.

In the __scrap_files method, for each selected course, used Iterfzf to present the user with a list of file names to select from and stored the selected file names in a list.

Modified the `__str__` method of the Course and CMSFile classes in the course.py file to return a string representation of the course or file that includes the course or file name.

Step 4: ⌨️ Coding

File	Instructions	Progress
`src/scraper.py`	In the Scraper class, import Iterfzf. In the __scrap_courses method, use Iterfzf to present the user with a list of course names to select from. Store the selected course names in a list. In the __scrap_files method, for each selected course, use Iterfzf to present the user with a list of file names to select from. Store the selected file names in a list.	✅ Commit `d45c286`
`src/course.py`	In the Course and CMSFile classes, modify the `__str__` method to return a string representation of the course or file that includes the course or file name.	✅ Commit `d45c286`

Step 5: 🔁 Code Review

Here are my self-reviews of my changes at sweep/iterfzf-selection.

Here is the 1st review

No changes required. The code changes in both src/course.py and src/scraper.py are correct and do not contain any errors or unimplemented sections. The addition of the iterfzf module in src/scraper.py enhances the functionality of the __scrap_courses and __scrap_files methods by allowing the user to select which courses and files they are interested in. Good job!

I finished incorporating these changes.

To recreate the pull request, leave a comment prefixed with "sweep:" or edit the issue.
^{Join Our Discord}

from cms-downloader-refined.

sweep: Use Iterfzf to select courses and files about cms-downloader-refined HOT 1 OPEN

Comments (1)

Here's the PR! #10.

Step 1: 🔍 Code Search

Step 2: 🧐 Snippet Analysis

Step 3: 📝 Planning

Description

Summary of Changes

Step 4: ⌨️ Coding

Step 5: 🔁 Code Review

Related Issues (9)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

	import os
	import re
	from datetime import datetime

	from bs4 import BeautifulSoup
	from sanitize_filename import sanitize


	class Course:
	    """
	    Class for storing course information.

	    ``course_text`` and ``course_soup`` are not set by ``__init__``; they are
	    attached afterwards through ``set_course_text`` and ``set_course_soup``
	    and must exist before the properties, ``str()`` or ``get_course_files``
	    are used.
	    """

	    def __init__(self, course_url: str) -> None:
	        """
	        :param course_url: URL of the course page, containing ``id=<value>``.
	        :raises IndexError: If the URL does not contain the text ``id``.
	        """
	        self.course_url = course_url
	        # Take what follows the first "id", drop the single separator
	        # character after it (normally "="), and cut at the next "&".
	        self.id = self.course_url.split("id")[1][1:].split("&")[0]
	        self.files = []

	    def __str__(self) -> str:
	        return f"[{self.course_code}] {self.course_name}"

	    __repr__ = __str__

	    @staticmethod
	    def get_course_regex() -> re.Pattern:
	        """
	        Return the pattern used to parse one entry of the courses table.

	        Group 1 is the text between ``(|`` and ``|)``; group 2 is the text
	        that follows, up to the next ``(``.
	        """
	        return re.compile(r"\n*[\(][\|]([^\|]*)[\|][\)]([^\(]*)[\(].*\n*")

	    @property
	    def course_code(self) -> str:
	        """The part of the course text before the first hyphen."""
	        return self.course_text.split("-", 1)[0].strip()

	    @property
	    def course_name(self) -> str:
	        """
	        The part of the course text after the first hyphen.

	        The text is split only once so that a name which itself contains
	        a hyphen is returned in full.

	        :raises IndexError: If the course text contains no hyphen.
	        """
	        return self.course_text.split("-", 1)[1].strip()

	    def set_course_text(self, course_text: str) -> None:
	        """
	        Set the course text. (e.g. "CS 201 - Programming 1")
	        course code + course name

	        :param course_text: The course text.
	        """
	        self.course_text = course_text

	    def create_course_directory(self) -> None:
	        """Create the target directory of every file collected so far."""
	        for file in self.files:
	            os.makedirs(file.dir_path, exist_ok=True)

	    def set_course_soup(self, course_soup: BeautifulSoup) -> None:
	        """
	        :param course_soup: Parsed HTML of the course page.
	        """
	        self.course_soup = course_soup

	    def get_course_files(self, course_path) -> None:
	        """
	        Get the list of files in the course.

	        Every ``card-body`` element that contains a ``<strong>`` tag is
	        wrapped in a ``CMSFile`` and appended to ``self.files``.

	        :param course_path: Directory handed to each ``CMSFile`` as the
	            course's download location.
	        """
	        for item in self.course_soup.find_all(class_="card-body"):
	            # check if the card is not a course content, useful for `Filter weeks` card
	            if item.find('strong') is None:
	                continue
	            self.files.append(CMSFile(soup=item, course_path=course_path))


	class CMSFile:
	    """A downloadable file described by one content card of a course page."""

	    def __init__(self, soup: BeautifulSoup, course_path) -> None:
	        """
	        Derive URL, week label, names and target paths from a content card.

	        :param soup: The card element describing the file.
	        :param course_path: Directory of the owning course; the file goes
	            into a per-week sub-directory below it.
	        :raises ValueError: If the week heading is not a ``%Y-%m-%d`` date.
	        """
	        # Imported here instead of at module level because scraper.py
	        # itself imports this module.
	        from scraper import HOST

	        self.soup = soup
	        self.url = HOST + self.soup.find("a")["href"]

	        # The week heading is looked up four levels above the card;
	        # "Week: 2023-10-01" becomes "W 10-01".
	        week_heading = self.soup.parent.parent.parent.parent.find("h2").text.strip()
	        week_date = re.sub(r"Week: (.*)", "\\1", week_heading)
	        self.week = datetime.strptime(week_date, "%Y-%m-%d").strftime("W %m-%d")

	        numbered = self.get_file_regex()
	        self.description = re.sub(numbered, "\\1", self.soup.find("div").text).strip()
	        self.name = sanitize(re.sub(numbered, "\\1", self.soup.find("strong").text).strip())

	        self.extension = self._extension_from_url(self.url)
	        self.dir_path = os.path.join(course_path, self.week)
	        # Avoid a dangling "." when the URL carries no extension.
	        filename = f"{self.name}.{self.extension}" if self.extension else self.name
	        self.path = os.path.join(self.dir_path, filename)

	    @staticmethod
	    def _extension_from_url(url: str) -> str:
	        """
	        Return the file extension of *url* without the leading dot.

	        Only the path component is inspected, so a query string or dots in
	        the host name or in directory names never end up in the result.
	        An empty string is returned when the last path segment has no
	        extension.
	        """
	        from urllib.parse import urlparse

	        return os.path.splitext(urlparse(url).path)[1].lstrip(".")

	    @staticmethod
	    def get_file_regex() -> re.Pattern:
	        """Return the pattern matching ``<digits> - <text>``; group 1 is the text."""
	        return re.compile(r"[0-9]* - (.*)")

	    def __str__(self) -> str:
	        return f"{self.name}"

	    __repr__ = __str__

	import threading
	import os
	import random
	import re
	import json
	from typing import Dict, List

	import requests
	import yaml
	from bs4 import BeautifulSoup
	from course import CMSFile, Course
	from requests_ntlm import HttpNtlmAuth
	from tqdm import tqdm

	from auth import Credentials, CMSAuthenticationError

	YML_FILE = "config.yml"

	# The configuration is read once at import time. The context manager
	# closes the file handle instead of leaving it to the garbage collector.
	with open(YML_FILE, encoding="utf-8") as _config_file:
	    YML_CONFIG = yaml.safe_load(_config_file)

	HOST = YML_CONFIG["host"]
	DOWNLOADS_DIR = YML_CONFIG["downloads_dir"]

	# Colours a download progress bar is picked from at random.
	TQDM_COLORS = [
	    "#ff0000",
	    "#00ff00",
	    "#0000ff",
	    "#ffff00",
	    "#00ffff",
	    "#ff00ff",
	    "#ffffff",
	    "#000000",
	]


	class Scraper:
	    """
	    Class for scraping data from GUC CMS.
	    """

	    # Number of authentication attempts ``run`` makes before giving up.
	    MAX_AUTH_ATTEMPTS = 3

	    def __init__(self, credentials: Credentials):
	        """
	        :param credentials: Object providing ``username`` and ``password``.
	        """
	        self.credentials: Credentials = credentials
	        self.session: requests.Session = requests.Session()
	        self.session.auth = HttpNtlmAuth(credentials.username, credentials.password)
	        self.session.headers.update({"User-Agent": "Mozilla/5.0"})
	        self.html_parser: str = "html.parser"
	        # Keyword arguments shared by every GET request.
	        # SECURITY NOTE(review): "verify": False turns off TLS certificate
	        # verification for all requests. Left unchanged because the reason
	        # is not visible here -- confirm it is still required.
	        self.get_args: Dict[str, object] = {
	            "auth": self.session.auth,
	            "verify": False,
	        }

	    @property
	    def home_soup(self) -> BeautifulSoup:
	        """
	        Get home page.

	        Every access performs a new GET request to ``HOST``.
	        """
	        return BeautifulSoup(self.session.get(HOST, **self.get_args).text, self.html_parser)

	    def run(self) -> None:
	        """
	        Run the scraper.

	        Authenticates, loads the course list, collects the files of every
	        course, creates the target directories and downloads the files.

	        :raises CMSAuthenticationError: If authentication still fails after
	            ``MAX_AUTH_ATTEMPTS`` attempts.
	        """
	        # Retry in a bounded loop: recursing on every failure would only
	        # end with a RecursionError when the server keeps rejecting us.
	        # NOTE(review): the session's auth object is built once in
	        # __init__, so a retry sends the same username/password -- confirm
	        # whether Credentials is expected to supply new ones.
	        for attempt in range(1, self.MAX_AUTH_ATTEMPTS + 1):
	            try:
	                self.authenticate()
	            except CMSAuthenticationError:
	                self.credentials.remove_credentials()
	                if attempt == self.MAX_AUTH_ATTEMPTS:
	                    raise
	            else:
	                break

	        self.__scrap_courses()
	        self.__scrap_files()
	        self.__create_courses_dir()
	        self.__download_all_files()

	    def __download_all_files(self):
	        """
	        Download every pending file, one thread per file, and wait for all.

	        An exception raised inside a download thread is not propagated to
	        the caller.
	        """
	        threads = [threading.Thread(target=self.__download_file, args=(file,)) for file in self.files]
	        for thread in threads:
	            thread.start()
	        # Block until every download ended so run() returns only when done.
	        for thread in threads:
	            thread.join()

	    def __create_courses_dir(self):
	        """Create the download directories of every course."""
	        for course in self.courses:
	            course.create_course_directory()

	    def __scrap_courses(self) -> None:
	        """Load the courses from ``.courses.json`` if present, else scrape them."""
	        if os.path.exists(".courses.json"):
	            self.__get_cached_courses()
	        else:
	            self.__cache_courses()

	    def __cache_courses(self):
	        """Scrape course names and links and store them in ``.courses.json``."""
	        self.course_names = self.__get_course_names()
	        self.courses = self.__get_available_courses()
	        self._populate_courses_data()
	        # Build the mapping first so the cache file is not created (and
	        # left empty) when a course is missing its text.
	        data = {course.course_text: course.course_url for course in self.courses}
	        with open(".courses.json", "w") as f:
	            json.dump(data, f, indent=4)

	    def __get_cached_courses(self):
	        """Rebuild the course list from the ``.courses.json`` mapping of text to URL."""
	        with open(".courses.json", "r") as f:
	            courses_data = json.load(f)

	        courses = []
	        for course_text, link in courses_data.items():
	            course = Course(course_url=link)
	            course.set_course_text(course_text)
	            courses.append(course)

	        self.courses = courses
	        self.course_names = list(courses_data.keys())
	        self.courses_links = list(courses_data.values())
	        self._populate_courses_data()

	    def __scrap_files(self):
	        """Collect the files of every course below ``DOWNLOADS_DIR/<course>``."""
	        for course in self.courses:
	            course.get_course_files(os.path.join(DOWNLOADS_DIR, str(course)))

	    def _populate_courses_data(self):
	        """Attach the course texts and the parsed course pages to the courses."""
	        # populate courses with names; courses and names are paired by position
	        for course, course_text in zip(self.courses, self.course_names):
	            course.set_course_text(course_text)

	        # populate courses soups
	        self.get_courses_soup()

	    def authenticate(self) -> None:
	        """
	        Authenticate with GUC CMS.

	        :raises CMSAuthenticationError: If the home page does not answer
	            with status 200.
	        """
	        response = self.session.get(HOST, **self.get_args)
	        if response.status_code != 200:
	            raise CMSAuthenticationError("Authentication failed.")

	    def __get_available_courses(self) -> List[Course]:
	        """
	        Get list of courses.

	        Collects every home-page link that matches the course-view URL
	        pattern, stores the absolute URLs in ``self.courses_links`` and
	        returns one ``Course`` per URL.
	        """
	        hrefs = [link.get("href") for link in self.home_soup.find_all("a") if link.get("href")]
	        self.courses_links = [
	            HOST + href for href in hrefs if re.match(r"\/apps\/student\/CourseViewStn\?id(.*)", href)
	        ]
	        return [Course(link) for link in self.courses_links]

	    def __get_course_names(self) -> List[str]:
	        """
	        Get course names.

	        Each entry of the courses table is rewritten to ``<group 1>-<group 2>``
	        of ``Course.get_course_regex``.
	        """
	        courses_table = list(
	            self.home_soup.find(
	                "table",
	                {"id": "ContentPlaceHolderright_ContentPlaceHoldercontent_GridViewcourses"},
	            )
	        )
	        course_regex = Course.get_course_regex()
	        # The first two children and the last one are skipped.
	        return [re.sub(course_regex, r"\1-\2", row.text.strip()).strip() for row in courses_table[2:-1]]

	    def get_courses_soup(self) -> None:
	        """
	        Get courses page.

	        Fetches every course URL and stores the parsed page on the course.
	        """
	        for course in self.courses:
	            page = self.session.get(course.course_url, **self.get_args).text
	            course.set_course_soup(BeautifulSoup(page, self.html_parser))

	    def __download_file(self, file: CMSFile) -> None:
	        """
	        Stream one file to ``file.path`` while showing a progress bar.

	        :param file: The file to download.
	        :raises CMSAuthenticationError: If the server does not answer with
	            status 200.
	        """
	        response = self.session.get(file.url, **self.get_args, stream=True, allow_redirects=True)
	        try:
	            if response.status_code != 200:
	                raise CMSAuthenticationError("Authentication failed.")

	            # Content-Length may be absent; with total=None tqdm shows a
	            # plain counter instead of a percentage.
	            content_length = response.headers.get("Content-Length")
	            total_size = int(content_length) if content_length is not None else None

	            # Write to a temporary name first: the ``files`` property
	            # skips paths that already exist, so an interrupted download
	            # must not leave a truncated file under the final name.
	            partial_path = f"{file.path}.part"
	            with open(partial_path, "wb") as f, tqdm(
	                total=total_size,
	                unit="B",
	                unit_scale=True,
	                desc=file.path,
	                initial=0,
	                dynamic_ncols=True,
	                colour=random.choice(TQDM_COLORS),
	            ) as t:
	                for chunk in response.iter_content(chunk_size=1024):
	                    f.write(chunk)
	                    t.update(len(chunk))
	            os.replace(partial_path, file.path)
	        finally:
	            # A streamed response keeps its connection until closed.
	            response.close()

	    @property
	    def files(self) -> List[CMSFile]:
	        """
	        Get all files.

	        Only files whose target path does not exist yet are returned.
	        """
	        return [file for course in self.courses for file in course.files if not os.path.exists(file.path)]