web表格 #Excel

import os
import subprocess

from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import random
import string
from selenium import webdriver
from selenium.webdriver.chrome.service import Service


def generate_random_string(length=6):
    """Return a random string of ``length`` ASCII letters and digits.

    Characters are drawn with replacement from ``string.ascii_letters``
    followed by ``string.digits`` using the module-level ``random`` generator.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Force-terminate every running chrome.exe before starting the driver
# (presumably so the profile directory used below is not locked by another
# Chrome instance -- confirm).
# check=False: a non-zero taskkill exit status is deliberately ignored, and
# taskkill's own output is discarded.
subprocess.run(
    ["taskkill", "/f", "/im", "chrome.exe"],
    check=False,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# Point Chrome at the "User Data" directory under %LOCALAPPDATA%.
# A missing LOCALAPPDATA variable raises KeyError here.
user_data_dir = os.path.join(os.environ['LOCALAPPDATA'], r"Google\Chrome\User Data")

# chromedriver is expected at a fixed location on this machine.
service = Service(executable_path=r'C:\Windows\chromedriver.exe')

options = webdriver.ChromeOptions()
options.add_argument("user-data-dir=" + user_data_dir)

driver = webdriver.Chrome(service=service, options=options)

# Open an initial page; the user navigates to the page holding the tables
# from here.
driver.get("https://baidu.com")

# Characters that Windows does not allow in file names.
_INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def _sanitize_filename(text):
    """Return ``text`` made safe for use inside a file name.

    Reserved characters and control characters are replaced with ``_`` and
    surrounding whitespace is removed. Returns ``''`` when ``text`` is empty,
    ``None`` or contains nothing usable, so callers can fall back to a default.
    """
    if not text:
        return ''
    cleaned = ''.join(
        '_' if ch in _INVALID_FILENAME_CHARS or ord(ch) < 32 else ch
        for ch in str(text)
    )
    return cleaned.strip()


def _table_to_dataframe(table):
    """Convert one BeautifulSoup ``<table>`` element into a DataFrame.

    The first ``<tr>`` supplies the column names and the remaining rows the
    data. Tables without any cells yield an empty DataFrame. When a data row
    is wider than the header, the header is extended with generated
    ``column_N`` names; short rows are padded with ``None``.
    """
    rows = [
        [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    width = max((len(row) for row in rows), default=0)
    if width == 0:
        return pd.DataFrame()

    header, body = rows[0], rows[1:]
    header = header + [f'column_{k + 1}' for k in range(len(header), width)]
    body = [row + [None] * (width - len(row)) for row in body]
    return pd.DataFrame(body, columns=header)


# Interactive loop: each pass reads the current page, previews every table
# found and writes the selected table(s) to .xlsx files in the working
# directory. The outer try/finally guarantees the browser session is shut
# down on every exit path (normal exit, closed window, Ctrl+C, ...).
try:
    while True:
        try:
            # Wait until the user has navigated to the page with the tables.
            input("请访问web表格所在页面,回车开始执行:")
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # File-name prefix: the sanitized page title, or a random string
            # when the page has no usable title.
            raw_title = soup.title.string if soup.title else None
            title_tag = _sanitize_filename(raw_title) or generate_random_string()
            current_date = datetime.now().strftime("%m%d")

            tables = soup.find_all('table')
            num_tables = len(tables)
            if num_tables == 0:
                print("未找到表格,请检查页面内容。")
                # Go straight back to the "press Enter" prompt.
                continue

            print(f"*****识别到 {num_tables} 个表格*****")

            dataframes = []
            for i, table in enumerate(tables):
                df = _table_to_dataframe(table)
                dataframes.append(df)

                print(f"\n======================预览数据 (表格 {i + 1})===========================")
                print(df.head())

            # Ask which table to export: a 1-based index, or 'a' for all.
            selection = input(f"\n请输入要输出的表格序号 (1-{num_tables}) 或 'a' 输出所有表格: ").strip().lower()

            if selection == 'a':
                for i, df in enumerate(dataframes):
                    excel_path = f'{title_tag}_{current_date}_table{i + 1}.xlsx'
                    df.to_excel(excel_path, index=False)
                    print(f"数据已保存到 {excel_path}")
            else:
                try:
                    selected_table = int(selection) - 1
                except ValueError:
                    print("请输入有效的序号")
                else:
                    if 0 <= selected_table < num_tables:
                        excel_path = f'{title_tag}_{current_date}_table{selected_table + 1}.xlsx'
                        dataframes[selected_table].to_excel(excel_path, index=False)
                        print(f"数据已保存到 {excel_path}")
                    else:
                        print("无效的table序号。")

        except Exception as e:
            if 'target window already closed' in str(e):
                print('监听窗口已关闭..')
                # SystemExit instead of the site-provided exit(), which is
                # not guaranteed to exist; the finally below cleans up.
                raise SystemExit
            print(f"发生错误: {e}")

        # Ask whether to process another page.
        cont = input('继续处理?(y/其他退出): ')
        if cont.strip().lower() != 'y':
            break
finally:
    # Best-effort shutdown: the session may already be gone (e.g. the window
    # was closed), so a failure here is reported rather than raised.
    try:
        driver.quit()
    except Exception as quit_error:
        print(f"关闭浏览器时出错: {quit_error}")
 
 
Back to Top