Web Scraping: Downloading Files


Downloading Files

Requirements: the textbooks I bought for the graduate entrance exam have matching online course resources, and I want to download all of the course files automatically in one step, instead of tediously opening F12 for each page to locate the network resource, then clicking download and renaming files one by one.


import os
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

class RedirectCourseCrawler:
    def __init__(self, login_url, target_url, username=None, password=None):
        chrome_options = Options()
        chrome_options.add_argument('--ignore-ssl-errors=yes')
        chrome_options.add_argument('--ignore-certificate-errors')

        self.driver = webdriver.Chrome(options=chrome_options)
        self.login_url = login_url
        self.actual_target_url = target_url
        self.username = username
        self.password = password
        self.wait = WebDriverWait(self.driver, 20)

    def _wait_for_redirect(self, expected_url_part, timeout=30):
        # Poll the current URL until it contains the expected fragment
        start_time = time.time()
        while time.time() - start_time < timeout:
            if expected_url_part in self.driver.current_url:
                return True
            time.sleep(2)
        raise TimeoutError(f"No redirect to a page containing '{expected_url_part}' within {timeout} seconds")

    def login(self):
        self.driver.get(self.login_url)

        if self.username and self.password:
            try:
                username_elem = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="text"]'))
                )
                password_elem = self.driver.find_element(By.CSS_SELECTOR, 'input[type="password"]')

                username_elem.send_keys(self.username)
                password_elem.send_keys(self.password)

                submit_btn = self.driver.find_element(By.CSS_SELECTOR, 'button.login-btn')
                submit_btn.click()
                print("Login form submitted")
            except Exception as e:
                print(f"Automatic login failed: {e}")
                input("Please log in manually, then press Enter to continue...")
        else:
            input("Please complete the login manually, then press Enter to continue...")

        try:
            self._wait_for_redirect("/home/")
            print("Login successful, current URL:", self.driver.current_url)
        except TimeoutError:
            print("Warning: the browser may not have redirected to the home page")

        print("Waiting a safety interval (60 seconds)...")
        time.sleep(60)

    def navigate_to_target(self):
        print(f"Opening target page: {self.actual_target_url}")
        self.driver.get(self.actual_target_url)

        try:
            self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "chapter-title-text"))
            )
            print("Target page loaded")
        except Exception as e:
            print("Target page failed to load:", e)
            self.driver.save_screenshot('page_load_error.png')

    def expand_chapters(self):
        chapters = self.driver.find_elements(By.CLASS_NAME, "chapter-title-text")
        print(f"Found {len(chapters)} chapters")

        for idx, chapter in enumerate(chapters, 1):
            try:
                # Scroll each chapter header into view before clicking it
                self.driver.execute_script(
                    "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", chapter)
                self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "chapter-title-text")))
                chapter.click()
                print(f"Expanded chapter {idx}")
                time.sleep(1)
            except Exception as e:
                print(f"Failed to expand chapter {idx}: {e}")

    def get_file_links(self):
        valid_domains = ['icourses.cn', 'resdoc.icourses.cn']
        elements = self.driver.find_elements(By.CSS_SELECTOR, '[data-class="media"]')

        file_links = []
        for elem in elements:
            try:
                file_type = elem.get_attribute("data-type")
                title = elem.get_attribute("data-title")[:50]
                url = elem.get_attribute("data-url") or elem.get_attribute("data-ppturl")

                if not url:
                    continue
                if not any(domain in url for domain in valid_domains):
                    print(f"Skipping resource from untrusted domain: {url}")
                    continue

                file_links.append({
                    "type": file_type,
                    "title": title.strip(),
                    "url": url
                })
            except Exception as e:
                print("Error while parsing file element:", e)

        return file_links

    def download_files(self, file_links):
        os.makedirs("downloads", exist_ok=True)

        for file in file_links:
            try:
                # Build a filesystem-safe base name
                safe_name = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in file['title'])
                base_name = safe_name.strip()
                extension = file['type']

                # Find a unique filename by appending a counter
                counter = 0
                while True:
                    if counter == 0:
                        current_filename = f"{base_name}.{extension}"
                    else:
                        current_filename = f"{base_name}_{counter}.{extension}"
                    filename = os.path.join("downloads", current_filename)
                    if not os.path.exists(filename):
                        break
                    counter += 1

                # Download the file; the Referer header keeps the server from rejecting the request
                headers = {
                    "Referer": self.actual_target_url,
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                }

                with requests.get(file['url'], headers=headers, stream=True, timeout=20) as r:
                    r.raise_for_status()
                    with open(filename, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                print(f"Downloaded: {filename}")
            except requests.exceptions.Timeout:
                print(f"Download timed out: {file['title']}")
            except Exception as e:
                print(f"Download failed {file['title']}: {str(e)[:100]}")

    def run(self):
        try:
            self.login()
            self.navigate_to_target()
            self.expand_chapters()
            files = self.get_file_links()
            print(f"About to download {len(files)} files")
            self.download_files(files)
        except Exception as e:
            print("Unhandled error in main flow:", e)
            self.driver.save_screenshot('final_error.png')
        finally:
            self.driver.quit()
            print("Browser closed")

if __name__ == "__main__":
    config = {
        "login_url": "https://www.icourses.cn/login",
        "target_url": "https://www.icourses.cn/web/sword/portal/shareDetails?cId=5884#/course/chapter",
        "username": None,
        "password": None
    }

    crawler = RedirectCourseCrawler(**config)
    crawler.run()

This code uses the online resources for the textbook **计算机组成原理(袁春风)** (Computer Organization, by Yuan Chunfeng) as its example.

The platform's login page is: https://www.icourses.cn/home

The corresponding resource page is: https://www.icourses.cn/web/sword/portal/shareDetails?cId=5884#/course/chapter
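
To point the crawler at this course, the config at the bottom of the script already carries both URLs; only the credentials need filling in. The username and password values below are placeholders for your own icourses.cn account, not real ones (leaving them as None makes login() pause for a manual browser login instead):

# Placeholder credentials -- substitute your own icourses.cn account.
# Leaving both as None makes login() wait for a manual browser login instead.
config = {
    "login_url": "https://www.icourses.cn/login",
    "target_url": "https://www.icourses.cn/web/sword/portal/shareDetails?cId=5884#/course/chapter",
    "username": "your_username",  # hypothetical
    "password": "your_password",  # hypothetical
}
crawler = RedirectCourseCrawler(**config)
crawler.run()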

Renaming the .ppt files

The data-url attribute labels the downloaded resources as ppt, but the files actually served are PDFs, so after crawling everything the .ppt files turned out to be "corrupted" and would not open. The fix is to batch-rename the file extensions.
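
Before renaming anything, it is worth confirming what the files really are. A minimal sketch that checks the PDF magic bytes, assuming the files sit in the downloads/ directory created by the crawler above:

# A minimal sketch to confirm the downloaded "ppt" files are really PDFs,
# assuming they sit in the downloads/ directory created by the crawler above.
import glob

for path in glob.glob('downloads/*.ppt'):
    with open(path, 'rb') as f:
        magic = f.read(4)
    # PDF files start with the bytes %PDF; real PowerPoint files do not
    print(path, '-> PDF' if magic == b'%PDF' else '-> not a PDF')

Any file that reports %PDF is a PDF regardless of its extension, so the script below only needs to append .pdf to the name: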

import os
import glob
import sys

def rename_ppt_files(directory):
    """
    Rename every .ppt file in the directory to .ppt.pdf
    :param directory: path of the target directory
    """
    # Collect all .ppt files, including those in subdirectories
    ppt_files = glob.glob(os.path.join(directory, '**', '*.ppt'), recursive=True)

    if not ppt_files:
        print(f"No .ppt files found in directory {directory}")
        return

    renamed_count = 0
    error_count = 0

    for original_path in ppt_files:
        try:
            # Skip files that already end in .ppt.pdf
            if original_path.endswith(".ppt.pdf"):
                continue

            # Build the new path
            dir_name = os.path.dirname(original_path)
            base_name = os.path.basename(original_path)
            new_name = base_name + ".pdf"  # append the extra extension
            new_path = os.path.join(dir_name, new_name)

            # Perform the rename
            os.rename(original_path, new_path)
            print(f"Renamed: {base_name} -> {new_name}")
            renamed_count += 1

        except Exception as e:
            error_count += 1
            print(f"Error while processing {original_path}: {str(e)}")

    # Print summary statistics
    print("\nDone. Summary:")
    print(f"Files processed: {len(ppt_files)}")
    print(f"Renamed successfully: {renamed_count}")
    print(f"Failures: {error_count}")

if __name__ == "__main__":
    # Target directory (defaults to the current directory)
    target_dir = input("Enter the target directory path (press Enter for the current directory): ").strip()

    if not target_dir:
        target_dir = os.getcwd()

    if not os.path.isdir(target_dir):
        print(f"Error: directory {target_dir} does not exist")
        sys.exit(1)

    # Confirm before renaming
    confirm = input(f"About to process directory: {target_dir}\nContinue? (y/n): ").lower()
    if confirm != 'y':
        print("Operation cancelled")
        sys.exit(0)

    rename_ppt_files(target_dir)

  • Title: 爬虫——爬取文件 (Web Scraping: Downloading Files)
  • Author: wlwhonest
  • Created at: 2025-04-05 05:58:27
  • Updated at: 2025-04-05 06:42:57
  • Link: https://blog.wlwhonest.top/2025/04/05/爬虫——爬取文件/
  • License: This work is licensed under CC BY-NC-SA 4.0.