WeeklyReportViaAI/main.py

335 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import datetime
import time
import os
import smtplib
from email.message import EmailMessage
from habanero import Crossref
import markdown as md
from weasyprint import HTML, CSS
# ================= 1. Configuration =================
# Secrets are read from environment variables rather than being committed to
# the repository. Any credential that used to be hard-coded in this section
# must be considered leaked and should be rotated.
# A missing variable yields an empty string, which the pipeline steps treat as
# "not configured" (the request fails or the step is skipped).
FEISHU_APP_ID = "cli_a9d25c8530785cc8"
FEISHU_APP_SECRET = os.environ.get("FEISHU_APP_SECRET", "")
SPREADSHEET_TOKEN = "JyyEsR8tYh9Q2rt08v7cogWznJg"
SHEET_ID = "3105e6"
DS_API_KEY = os.environ.get("DS_API_KEY", "")
DS_API_URL = "https://api.deepseek.com/chat/completions"
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")
MAIL_SENDER = "lmyleo7@163.com"  # 163.com mailbox used as the sender
MAIL_APP_PASSWORD = os.environ.get("MAIL_APP_PASSWORD", "")  # 163.com authorization code
MAIL_RECIPIENT = "kwei@zju.edu.cn"
MAIL_SMTP_SERVER = "smtp.163.com"
MAIL_SMTP_PORT = 465
# Each keyword is queried separately against every configured journal.
TARGET_KEYWORDS = [
    "Silicon Photonics",
    "Photonic Integrated Circuits",
    "Nanophotonics",
    "Metasurface",
    "Metamaterials",
    "Computational Imaging",
    "Plasmonics",
    "Optoelectronics",
    "Diffractive Optics",
    "Optical Interconnects",
    "On-chip Photonics",
    "Flat Optics",
]
# ================= 2. Core functions =================
def get_journal_configs():
    """[STEP 1] Load the journal list from the configured Feishu spreadsheet.

    Requests a tenant access token with ``FEISHU_APP_ID`` /
    ``FEISHU_APP_SECRET``, then reads range ``A2:C50`` of ``SHEET_ID``.
    Column A is used as the journal name, column B as the ISSN and column C as
    the e-ISSN. Rows whose first cell is empty are skipped.

    Returns:
        A list of ``{"name", "issn", "e_issn"}`` dicts (``issn`` / ``e_issn``
        are ``None`` when the cell is empty). An empty list is returned when
        nothing could be read; the failure reason is printed, not raised.
    """
    print("\n--- [STEP 1] 正在读取飞书期刊配置 ---")

    def _cell(row, idx):
        # Return the stripped text of a cell, or None when absent/empty.
        return str(row[idx]).strip() if (len(row) > idx and row[idx]) else None

    try:
        auth_url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
        # A timeout keeps an unresponsive endpoint from hanging the whole run.
        token_res = requests.post(
            auth_url,
            json={"app_id": FEISHU_APP_ID, "app_secret": FEISHU_APP_SECRET},
            timeout=30,
        )
        token_res.raise_for_status()
        token = token_res.json().get("tenant_access_token")
        if not token:
            # Without this check the next request would be sent as "Bearer None".
            print("❌ 飞书配置读取失败: 未获取到 tenant_access_token")
            return []

        read_url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values_batch_get"
        headers = {"Authorization": f"Bearer {token}"}
        sheet_res = requests.get(
            read_url,
            params={"ranges": f"{SHEET_ID}!A2:C50"},
            headers=headers,
            timeout=30,
        )
        sheet_res.raise_for_status()
        value_ranges = (sheet_res.json().get("data") or {}).get("valueRanges") or []
        if not value_ranges:
            return []

        rows = value_ranges[0].get("values") or []
        configs = []
        for r in rows:
            name = _cell(r, 0) if r else None
            if not name:
                continue
            configs.append({
                "name": name,
                "issn": _cell(r, 1),
                "e_issn": _cell(r, 2),
            })
        print(f"✅ 成功加载 {len(configs)} 条有效期刊配置")
        return configs
    except Exception as e:
        # Best-effort step: report the problem and let the caller stop cleanly.
        print(f"❌ 飞书配置读取失败: {e}")
        return []
def fetch_dois_crossref(journal_configs, days=7, per_keyword_limit=3):
    """[STEP 2] Query Crossref keyword by keyword for recently published papers.

    For every journal and every entry of ``TARGET_KEYWORDS`` one Crossref
    ``works`` query is issued, restricted to publications dated within the
    last ``days`` days. When the journal has an ISSN (e-ISSN preferred) it is
    used as a filter; otherwise the journal name is added to the query text.

    Args:
        journal_configs: Dicts with ``name``, ``issn`` and ``e_issn`` keys.
        days: Size of the look-back window in days.
        per_keyword_limit: Maximum number of results requested per query.

    Returns:
        A list of task dicts (``title_en``, ``doi``, ``journal``,
        ``hit_keyword``), de-duplicated by DOI across all queries.
    """
    print("\n--- [STEP 2] 正在 Crossref 检索 (按关键词逐一扫描) ---")
    cr = Crossref(mailto="hextorize@gmail.com")
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
    tasks = []
    seen_dois = set()  # DOIs already collected, for de-duplication
    for j in journal_configs:
        print(f"\n >>> 正在扫描期刊: {j['name']}")
        target_issn = j['e_issn'] or j['issn']
        # The filter only depends on the journal, so build it once per journal.
        filters = {'from-pub-date': start_date}
        if target_issn:
            filters['issn'] = target_issn
        for kw in TARGET_KEYWORDS:
            print(f" - 搜索关键词: {kw}...", end="", flush=True)
            # Without an ISSN, force the journal name into the query text.
            query_str = kw if target_issn else f'"{j["name"]}" {kw}'
            try:
                # A copy is passed so the client can never alter the shared dict.
                res = cr.works(
                    filter=dict(filters),
                    query=query_str,
                    limit=per_keyword_limit,
                    select="title,DOI",
                )
                items = res['message'].get('items', [])
            except Exception as e:
                # One failed query must not abort the scan; show why it failed.
                print(f" 跳过 (请求异常: {e})")
                continue
            new_found = 0
            for art in items:
                doi = art.get('DOI')
                if not doi or doi in seen_dois:
                    continue
                tasks.append({
                    "title_en": art['title'][0] if art.get('title') else "No Title",
                    "doi": doi,
                    "journal": j['name'],
                    "hit_keyword": kw,  # keyword that surfaced this paper
                })
                seen_dois.add(doi)
                new_found += 1
            print(f" 新找到 {new_found}")
    print(f"\n✅ 扫描结束,共锁定 {len(tasks)} 篇唯一论文")
    return tasks
def enrich_content_serpapi(tasks, delay=0.6):
    """[STEP 3] Attach a Google Scholar snippet to every task via SerpApi.

    Each task's DOI is searched with the ``google_scholar`` engine and the
    snippet of the first organic result is stored in ``task['snippet_en']``.
    When the lookup fails, an error marker is stored instead so later steps
    still find the key.

    Args:
        tasks: Task dicts containing at least a ``doi`` key; updated in place.
        delay: Seconds to sleep after each request to stay under rate limits.

    Returns:
        The same ``tasks`` list.
    """
    print("\n--- [STEP 3] 正在通过 SerpApi 提取核心内容 ---")
    for i, task in enumerate(tasks):
        print(f" [{i + 1}/{len(tasks)}] 检索 DOI: {task['doi']}...", end="", flush=True)
        try:
            params = {
                "engine": "google_scholar",
                "q": f"DOI:{task['doi']}",
                "api_key": SERP_API_KEY,
                "hl": "en",
            }
            res = requests.get("https://serpapi.com/search.json", params=params, timeout=30)
            res.raise_for_status()
            results = res.json().get("organic_results", [])
            task['snippet_en'] = results[0].get(
                "snippet", "No abstract snippet available."
            ) if results else "No content found."
            print(" [OK]")
        except Exception as e:
            # requests error messages contain the full request URL, including
            # the api_key query parameter. This text ends up in the emailed
            # report, so the key is redacted before it is stored.
            detail = str(e)
            if SERP_API_KEY:
                detail = detail.replace(SERP_API_KEY, "***")
            task['snippet_en'] = f"Search Error: {detail}"
            print(" [Error]")
        time.sleep(delay)  # small pause to avoid hitting API rate limits
    return tasks
# Labels the model is asked to emit, mapped to the task key each one fills.
_DS_REPLY_FIELDS = (
    ("中文标题", "title_zh"),
    ("中文摘要", "snippet_zh"),
    ("AI总结", "summary_zh"),
    ("AI 总结", "summary_zh"),
)


def _parse_ds_reply(content):
    """Extract the labelled fields from a DeepSeek reply into a dict."""
    parsed = {}
    current = None
    for raw in content.splitlines():
        # Drop Markdown decoration such as "**", "#", ">" or list bullets.
        line = raw.strip().lstrip("#*>- \t")
        if not line:
            continue
        for label, key in _DS_REPLY_FIELDS:
            if line.startswith(label):
                # Accept both the ASCII and the full-width colon after a label.
                value = line[len(label):].lstrip("::* \t").strip()
                parsed[key] = value.strip("*").strip()
                current = key
                break
        else:
            # An unlabelled line continues the most recent field.
            if current is not None:
                parsed[current] = f"{parsed[current]} {line}".strip()
    return parsed


def summarize_with_ds(tasks):
    """[STEP 4] Translate and summarize every task with the DeepSeek chat API.

    For each task a prompt built from ``title_en`` and ``snippet_en`` is sent
    to ``DS_API_URL``. The reply is parsed for the labelled lines and the
    results are stored as ``title_zh``, ``snippet_zh`` and ``summary_zh``.
    Keys that cannot be extracted are left unset. A failure on one task is
    printed and does not stop the remaining tasks.

    Args:
        tasks: Task dicts with ``title_en`` and ``snippet_en``; updated in place.

    Returns:
        The same ``tasks`` list.
    """
    print("\n--- [STEP 4] 正在请求 DeepSeek 处理内容 ---")
    headers = {"Authorization": f"Bearer {DS_API_KEY}", "Content-Type": "application/json"}
    for i, task in enumerate(tasks):
        print(f" > 处理第 {i + 1} 篇...", end="", flush=True)
        # The format section spells out "label:value" so that the reply
        # matches what _parse_ds_reply looks for.
        prompt = (f"你是一个光学专家。请翻译以下论文标题和摘要为中文并总结核心创新点200字内\n"
                  f"标题: {task['title_en']}\n摘要: {task['snippet_en']}\n"
                  f"格式要求:\n中文标题:xxx\n中文摘要:xxx\nAI总结:xxx")
        payload = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": "你是一个严谨的科研助手。"},
                {"role": "user", "content": prompt},
            ],
        }
        try:
            res = requests.post(DS_API_URL, json=payload, headers=headers, timeout=30)
            res.raise_for_status()
            content = res.json()['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Unlike a bare "except:", this lets KeyboardInterrupt through
            # and shows the reason for the failure.
            print(f" [失败] {e}")
            continue
        parsed = _parse_ds_reply(content)
        task.update(parsed)
        print(" [完成]" if parsed else " [失败] 未能解析返回内容")
    return tasks
def save_to_markdown(tasks):
    """[STEP 5] Write the weekly report as a Markdown file.

    The file is named ``<YYYY-MM-DD>周报.md`` and created in the current
    working directory. It starts with a table of contents followed by one
    section per task. Missing optional fields are rendered as ``N/A``.

    Args:
        tasks: Task dicts; ``journal``, ``doi`` and ``title_en`` are required,
            ``title_zh``, ``snippet_en``, ``snippet_zh``, ``summary_zh`` and
            ``hit_keyword`` are optional.

    Returns:
        Absolute path of the written Markdown file.
    """
    print("\n--- [STEP 5] 正在生成 Markdown 报告 ---")
    today_str = datetime.date.today().strftime("%Y-%m-%d")
    file_name = f"{today_str}周报.md"

    parts = [f"# 光学行业学术动态周报 ({today_str})\n\n", "## 目录\n"]
    for idx, task in enumerate(tasks, start=1):
        title = task.get('title_zh', 'N/A')
        parts.append(f"- [{idx}. {title}](#task-{idx})\n")
    parts.append("\n---\n\n")

    for idx, task in enumerate(tasks, start=1):
        title_zh = task.get('title_zh', 'N/A')
        # Explicit anchor so the table-of-contents links have a target;
        # headings get no id on their own with the extensions used for the PDF.
        parts.append(f'<a id="task-{idx}"></a>\n\n')
        parts.append(f"### {idx}. {title_zh}\n\n")
        # Two trailing spaces form a Markdown hard line break before the DOI line.
        parts.append(
            f"**期刊**: {task['journal']} | **匹配关键词**: {task.get('hit_keyword', 'N/A')}  \n"
        )
        parts.append(f"**DOI**: [{task['doi']}](https://doi.org/{task['doi']})\n\n")
        parts.append("#### 【标题】\n")
        parts.append(f"- **中文**: {title_zh}\n")
        parts.append(f"- **英文**: {task['title_en']}\n\n")
        parts.append("#### 【摘要原文】\n")
        parts.append(f"> *{task.get('snippet_en', 'N/A')}*\n\n")
        parts.append("#### 【摘要翻译】\n")
        parts.append(f"> {task.get('snippet_zh', 'N/A')}\n\n")
        parts.append("#### 【AI 深度总结】\n")
        parts.append(f"{task.get('summary_zh', 'N/A')}\n\n")
        parts.append("---\n\n")

    with open(file_name, "w", encoding="utf-8") as f:
        f.write("".join(parts))

    abs_path = os.path.abspath(file_name)
    print(f"✅ Markdown 周报已保存: {abs_path}")
    return abs_path
def markdown_to_pdf(md_path, css_path, pdf_path=None):
    """[STEP 6] Render a Markdown file to PDF using the given stylesheet.

    Args:
        md_path: Path of the Markdown source file.
        css_path: Path of the CSS file applied to the rendered HTML.
        pdf_path: Output path; defaults to ``md_path`` with a ``.pdf`` suffix.

    Returns:
        Absolute path of the generated PDF, or ``None`` when either input
        file does not exist.
    """
    md_missing = not os.path.exists(md_path)
    if md_missing:
        print(f"❌ Markdown 文件不存在: {md_path}")
        return None
    css_missing = not os.path.exists(css_path)
    if css_missing:
        print(f"❌ CSS 文件不存在: {css_path}")
        return None

    target = pdf_path if pdf_path is not None else os.path.splitext(md_path)[0] + ".pdf"

    with open(md_path, "r", encoding="utf-8") as source:
        markdown_source = source.read()

    rendered = md.markdown(markdown_source, extensions=["extra", "tables", "sane_lists"])
    # The body is wrapped in a div with id 'write' inside a minimal HTML page.
    page = "".join([
        "<!doctype html>",
        "<html><head><meta charset='utf-8'></head>",
        f"<body><div id='write'>{rendered}</div></body></html>",
    ])

    # Relative URLs in the document are resolved against the Markdown file's folder.
    source_dir = os.path.dirname(os.path.abspath(md_path))
    document = HTML(string=page, base_url=source_dir)
    document.write_pdf(target, stylesheets=[CSS(filename=css_path)])

    result = os.path.abspath(target)
    print(f"✅ PDF 已生成: {result}")
    return result
def send_pdf_via_email(pdf_path, subject, to_email):
    """[STEP 7] Mail the PDF as an attachment through the configured SMTP server.

    Args:
        pdf_path: Path of the PDF file to attach.
        subject: Subject line of the message.
        to_email: Recipient address.

    Returns:
        ``True`` when the message was handed to the server, ``False`` when the
        mail settings are incomplete, the PDF is missing, or sending failed.
    """
    required = (MAIL_SENDER, MAIL_APP_PASSWORD, to_email)
    if not all(required):
        print("⚠️ 邮箱配置为空,跳过发送。")
        return False
    if not os.path.exists(pdf_path):
        print(f"❌ PDF 文件不存在: {pdf_path}")
        return False

    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = MAIL_SENDER
    message["To"] = to_email
    message.set_content("技术前沿周报已生成,详见附件 PDF。")

    with open(pdf_path, "rb") as attachment:
        payload = attachment.read()
    message.add_attachment(
        payload,
        maintype="application",
        subtype="pdf",
        filename=os.path.basename(pdf_path),
    )

    try:
        # Implicit-TLS SMTP connection, closed automatically on exit.
        with smtplib.SMTP_SSL(MAIL_SMTP_SERVER, MAIL_SMTP_PORT) as smtp:
            smtp.login(MAIL_SENDER, MAIL_APP_PASSWORD)
            smtp.send_message(message)
        print(f"✅ 邮件已发送: {to_email}")
        return True
    except Exception as err:
        print(f"❌ 邮件发送失败: {err}")
        return False
# ================= 3. Entry point =================
def main():
    """Run the weekly-report pipeline end to end.

    Steps: read the journal configuration, search Crossref, enrich the hits
    via SerpApi, summarize them with DeepSeek, print a console preview, write
    the Markdown report, convert it to PDF and mail the PDF. The run stops
    early when no journal configuration or no paper is found.
    """
    # 1. Read the journal configuration.
    configs = get_journal_configs()
    if not configs:
        return

    # 2. Search keyword by keyword.
    paper_tasks = fetch_dois_crossref(configs)
    if not paper_tasks:
        print("\n📭 最近一周内未搜索到相关关键词的论文。")
        return

    # 3. Add abstract snippets (SerpApi).
    enriched = enrich_content_serpapi(paper_tasks)
    # 4. Translate and summarize (DeepSeek).
    final_reports = summarize_with_ds(enriched)

    # 5. Print a summary preview.
    print("\n" + "=" * 60)
    print(f"📊 最终简报汇总预览 - {datetime.date.today()}")
    print("=" * 60)
    for r in final_reports:
        # The opening bracket was missing, leaving an unbalanced "】".
        print(f"\n【{r['journal']}】(关键词: {r.get('hit_keyword')})")
        print(f"中文标题:{r.get('title_zh', 'N/A')}")
        print(f"AI 总结:{r.get('summary_zh', 'N/A')}")
        print(f"链接: https://doi.org/{r['doi']}")
        print("-" * 40)

    # 6. Write the Markdown report.
    md_path = save_to_markdown(final_reports)

    # 7. Build the PDF and send it by mail.
    # The stylesheet is expected next to this script.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    css_path = os.path.join(base_dir, "github.css")
    pdf_path = markdown_to_pdf(md_path, css_path)
    if pdf_path:
        today_str = datetime.date.today().strftime("%Y-%m-%d")
        subject = f"{today_str}技术前沿周报"
        send_pdf_via_email(pdf_path, subject, MAIL_RECIPIENT)


if __name__ == "__main__":
    start_time = time.time()
    main()
    print(f"\n✨ 总耗时: {round(time.time() - start_time, 2)}s")