web表格导出(油猴脚本版) #excel
说明
补一下之前的坑。可以方便地从任意网站提取表格并导出为 Excel 格式。原py版
发布地址 https://greasyfork.org/zh-CN/scripts/529375-web%E8%A1%A8%E6%A0%BC%E5%AF%BC%E5%87%BA%E5%8A%A9%E6%89%8B
说明
补一下之前的坑。可以方便地从任意网站提取表格并导出为 Excel 格式。原py版
发布地址 https://greasyfork.org/zh-CN/scripts/529375-web%E8%A1%A8%E6%A0%BC%E5%AF%BC%E5%87%BA%E5%8A%A9%E6%89%8B
web表格 #Excel
import os
import subprocess
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import random
import string
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
def generate_random_string(length: int = 6) -> str:
    """Return a random string of ASCII letters and digits.

    Args:
        length: Number of characters to generate (default 6).

    Returns:
        A string of ``length`` characters drawn, with replacement, from
        ``string.ascii_letters + string.digits``.
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))
# Best-effort: force-terminate every running chrome.exe before the script
# launches its own Chrome instance. check=True turns a non-zero taskkill exit
# status (e.g. no chrome.exe process to kill) into CalledProcessError, which
# is deliberately ignored; taskkill's own output is discarded.
# NOTE(review): presumably required because the script reuses the user's
# Chrome profile directory, which a running Chrome would hold open - confirm.
try:
    subprocess.run(
        ["taskkill", "/f", "/im", "chrome.exe"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )
except subprocess.CalledProcessError:
    pass
# Start Chrome via the chromedriver binary at a fixed path, pointing the
# browser at the current Windows user's Chrome "User Data" directory.
service = Service(executable_path=r'C:\Windows\chromedriver.exe')
user_data_dir = os.path.join(
    os.environ['LOCALAPPDATA'],
    r"Google\Chrome\User Data",
)
options = webdriver.ChromeOptions()
options.add_argument("user-data-dir=" + user_data_dir)
driver = webdriver.Chrome(service=service, options=options)
# Open an initial page in the new browser window.
driver.get("https://baidu.com")
# Maps every character Windows forbids in file names, plus the ASCII control
# characters (page titles may contain newlines or tabs), to an underscore.
_FILENAME_TRANSLATION = {ord(ch): '_' for ch in '\\/:*?"<>|'}
_FILENAME_TRANSLATION.update({code: '_' for code in range(32)})


def _safe_title(soup):
    """Return a file-name-safe prefix derived from the page's <title>.

    Falls back to ``generate_random_string()`` when the page has no
    ``<title>``, when ``soup.title.string`` is ``None``, or when the title is
    empty after stripping surrounding whitespace.
    """
    raw_title = soup.title.string if soup.title else None
    cleaned = (raw_title or '').strip().translate(_FILENAME_TRANSLATION)
    return cleaned or generate_random_string()


def _table_to_dataframe(table):
    """Convert one parsed <table> element into a pandas DataFrame.

    The first row that contains cells supplies the column names; the
    remaining rows are the data. ``<tr>`` elements without any ``td``/``th``
    cell are dropped. Rows are padded to the widest row so that ragged
    tables (e.g. headers using colspan) do not make the DataFrame
    constructor fail: missing header cells are named ``Unnamed: <index>``
    and missing data cells are ``None``. A table without any cells yields an
    empty DataFrame.
    """
    rows = []
    for tr in table.find_all('tr'):
        cells = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        if cells:
            rows.append(cells)
    if not rows:
        return pd.DataFrame()

    width = max(len(cells) for cells in rows)
    header = rows[0] + [f'Unnamed: {n}' for n in range(len(rows[0]), width)]
    body = [cells + [None] * (width - len(cells)) for cells in rows[1:]]
    return pd.DataFrame(body, columns=header)


# Interactive loop: each pass snapshots the page currently shown in the
# browser, previews every <table> on it and exports the selected one(s) to
# .xlsx files in the current working directory.
while True:
    try:
        # Block until the user has navigated to the page holding the tables.
        input("请访问web表格所在页面,回车开始执行:")
        div_html = driver.page_source
        soup = BeautifulSoup(div_html, 'html.parser')

        # File names are "<title>_<MMDD>_table<N>.xlsx".
        title_tag = _safe_title(soup)
        current_date = datetime.now().strftime("%m%d")

        tables = soup.find_all('table')
        num_tables = len(tables)
        if num_tables == 0:
            print("未找到表格,请检查页面内容。")
            # Go straight back to the "press Enter" prompt.
            continue
        print(f"*****识别到 {num_tables} 个表格*****")

        # One DataFrame per table, kept in page order so that the numbers
        # shown in the preview match the numbers the user can select.
        dataframes = []
        for i, table in enumerate(tables):
            df = _table_to_dataframe(table)
            dataframes.append(df)
            print(f"\n======================预览数据 (表格 {i + 1})===========================")
            print(df.head())

        # Ask which table to export: a 1-based number, or 'a' for all.
        selection = input(f"\n请输入要输出的表格序号 (1-{num_tables}) 或 'a' 输出所有表格: ").strip().lower()
        if selection == 'a':
            for i, df in enumerate(dataframes):
                excel_path = f'{title_tag}_{current_date}_table{i + 1}.xlsx'
                df.to_excel(excel_path, index=False)
                print(f"数据已保存到 {excel_path}")
        else:
            try:
                selected_table = int(selection) - 1
                if 0 <= selected_table < num_tables:
                    excel_path = f'{title_tag}_{current_date}_table{selected_table + 1}.xlsx'
                    dataframes[selected_table].to_excel(excel_path, index=False)
                    print(f"数据已保存到 {excel_path}")
                else:
                    print("无效的table序号。")
            except ValueError:
                print("请输入有效的序号")
    except Exception as e:
        # Top-level boundary of the interactive loop: report the error and
        # keep going, except when the browser window itself is gone.
        if 'target window already closed' in str(e):
            print('监听窗口已关闭..')
            # Shut down the chromedriver process as well, then leave without
            # relying on the site-provided exit() helper.
            driver.quit()
            raise SystemExit
        print(f"发生错误: {e}")

    # NOTE(review): the source had lost its indentation; this prompt is
    # assumed to sit at loop level (asked after every pass) - confirm.
    cont = input('继续处理?(y/其他退出): ')
    if cont.strip().lower() != 'y':
        driver.quit()
        break