335 lines
12 KiB
Python
335 lines
12 KiB
Python
import requests
|
||
import datetime
|
||
import time
|
||
import os
|
||
import smtplib
|
||
from email.message import EmailMessage
|
||
from habanero import Crossref
|
||
import markdown as md
|
||
from weasyprint import HTML, CSS
|
||
|
||
|
||
# ================= 1. 配置中心 =================
|
||
# --- Feishu (Lark) application and the spreadsheet holding the journal list ---
FEISHU_APP_ID = "cli_a9d25c8530785cc8"
# Secrets are read from environment variables of the same name so they are
# never committed with the source. Any value that was previously hard-coded
# here should be treated as leaked and rotated.
FEISHU_APP_SECRET = os.environ.get("FEISHU_APP_SECRET", "")
SPREADSHEET_TOKEN = "JyyEsR8tYh9Q2rt08v7cogWznJg"
SHEET_ID = "3105e6"

# --- DeepSeek chat-completions endpoint ---
DS_API_KEY = os.environ.get("DS_API_KEY", "")
DS_API_URL = "https://api.deepseek.com/chat/completions"

# --- SerpApi (Google Scholar engine) ---
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")

# --- Outgoing mail (163.com SMTP over SSL) ---
MAIL_SENDER = "lmyleo7@163.com"  # replace with your own 163.com address
# 163.com authorisation code (not the web login password).
MAIL_APP_PASSWORD = os.environ.get("MAIL_APP_PASSWORD", "")
MAIL_RECIPIENT = "kwei@zju.edu.cn"
MAIL_SMTP_SERVER = "smtp.163.com"
MAIL_SMTP_PORT = 465

# Each keyword is sent to Crossref as a separate query for every journal.
TARGET_KEYWORDS = [
    "Silicon Photonics",
    "Photonic Integrated Circuits",
    "Nanophotonics",
    "Metasurface",
    "Metamaterials",
    "Computational Imaging",
    "Plasmonics",
    "Optoelectronics",
    "Diffractive Optics",
    "Optical Interconnects",
    "On-chip Photonics",
    "Flat Optics",
]
|
||
|
||
|
||
# ================= 2. 核心功能函数 =================
|
||
|
||
def _sheet_cell(row, index):
    """Return the stripped text of ``row[index]``, or None if absent or blank."""
    if len(row) > index and row[index]:
        return str(row[index]).strip() or None
    return None


def get_journal_configs():
    """[STEP 1] Load the journal list from the Feishu spreadsheet.

    Obtains a tenant access token with FEISHU_APP_ID / FEISHU_APP_SECRET, then
    reads range ``A2:C50`` of sheet SHEET_ID in spreadsheet SPREADSHEET_TOKEN.
    Rows whose first cell is empty or blank are skipped.

    Returns:
        list[dict]: one dict per usable row with keys ``"name"`` (column 1),
        ``"issn"`` (column 2) and ``"e_issn"`` (column 3); the two ISSN values
        are None when the cell is missing or blank. An empty list is returned
        when the sheet yields no value range or when any step fails; failures
        are printed rather than raised.
    """
    print("\n--- [STEP 1] 正在读取飞书期刊配置 ---")
    try:
        auth_url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
        # A timeout keeps an unreachable endpoint from hanging the whole run.
        token_res = requests.post(
            auth_url,
            json={"app_id": FEISHU_APP_ID, "app_secret": FEISHU_APP_SECRET},
            timeout=30,
        ).json()
        token = token_res.get("tenant_access_token")
        if not token:
            # Fail here instead of sending "Bearer None" to the sheets API.
            raise RuntimeError(f"未获取到 tenant_access_token: {token_res.get('msg')}")

        read_url = f"https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{SPREADSHEET_TOKEN}/values_batch_get"
        headers = {"Authorization": f"Bearer {token}"}
        res = requests.get(
            read_url,
            params={"ranges": f"{SHEET_ID}!A2:C50"},
            headers=headers,
            timeout=30,
        ).json()

        value_ranges = res.get("data", {}).get("valueRanges", [])
        if not value_ranges:
            return []
        rows = value_ranges[0].get("values", [])

        configs = []
        for r in rows:
            name = _sheet_cell(r, 0) if r else None
            if not name:
                continue
            configs.append({
                "name": name,
                "issn": _sheet_cell(r, 1),
                "e_issn": _sheet_cell(r, 2),
            })
        print(f"✅ 成功加载 {len(configs)} 条有效期刊配置")
        return configs
    except Exception as e:
        # Best-effort boundary: the caller treats an empty list as "nothing to do".
        print(f"❌ 飞书配置读取失败: {e}")
        return []
|
||
|
||
|
||
def fetch_dois_crossref(journal_configs, days=7, per_keyword_limit=3):
    """[STEP 2] Query Crossref keyword by keyword for recently published papers.

    For every journal and every entry of TARGET_KEYWORDS one Crossref ``works``
    query is issued, filtered by ``from-pub-date``. When the journal has an
    ISSN (the e-ISSN is preferred) the query is also filtered by it; otherwise
    the quoted journal name is added to the free-text query instead.

    Args:
        journal_configs: iterable of dicts with keys ``"name"``, ``"issn"``
            and ``"e_issn"``.
        days: how many days back from today the publication-date filter starts.
        per_keyword_limit: maximum number of results requested per query.

    Returns:
        list[dict]: one entry per distinct DOI with keys ``"title_en"``,
        ``"doi"``, ``"journal"`` and ``"hit_keyword"`` (the first keyword that
        returned the DOI). A failing query is reported and skipped.
    """
    print(f"\n--- [STEP 2] 正在 Crossref 检索 (按关键词逐一扫描) ---")
    cr = Crossref(mailto="hextorize@gmail.com")
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")

    tasks = []
    seen_dois = set()  # DOIs already collected, across all journals and keywords

    for j in journal_configs:
        print(f"\n >>> 正在扫描期刊: {j['name']}")
        target_issn = j['e_issn'] or j['issn']

        for kw in TARGET_KEYWORDS:
            print(f" - 搜索关键词: {kw}...", end="", flush=True)

            filters = {'from-pub-date': start_date}
            if target_issn:
                filters['issn'] = target_issn
                query_str = kw
            else:
                # Without an ISSN the journal name goes into the query text.
                query_str = f'"{j["name"]}" {kw}'

            try:
                # Keep each query small so the downstream workload stays bounded.
                res = cr.works(
                    filter=filters,
                    query=query_str,
                    limit=per_keyword_limit,
                    select="title,DOI",
                )
                items = res['message'].get('items', [])

                new_found = 0
                for art in items:
                    doi = art.get('DOI')
                    if not doi or doi in seen_dois:
                        continue
                    tasks.append({
                        "title_en": art['title'][0] if art.get('title') else "No Title",
                        "doi": doi,
                        "journal": j['name'],
                        "hit_keyword": kw,  # remember which keyword matched
                    })
                    seen_dois.add(doi)
                    new_found += 1
                print(f" 新找到 {new_found} 篇")
            except Exception as e:
                # Best-effort: one failed query must not abort the scan, but
                # the reason is shown so repeated failures can be diagnosed.
                print(f" 跳过 (请求异常: {e})")

    print(f"\n✅ 扫描结束,共锁定 {len(tasks)} 篇唯一论文")
    return tasks
|
||
|
||
|
||
def enrich_content_serpapi(tasks):
    """[STEP 3] Attach a Google Scholar snippet to every task via SerpApi.

    Each task's DOI is searched with the ``google_scholar`` engine and the
    first organic result's ``snippet`` is stored under ``task['snippet_en']``.
    A placeholder is stored when there are no results or no snippet, and an
    error string when the request fails. The list is modified in place and
    returned.
    """
    print(f"\n--- [STEP 3] 正在通过 SerpApi 提取核心内容 ---")
    total = len(tasks)
    for position, paper in enumerate(tasks, start=1):
        print(f" [{position}/{total}] 检索 DOI: {paper['doi']}...", end="", flush=True)
        try:
            response = requests.get(
                "https://serpapi.com/search.json",
                params={
                    "engine": "google_scholar",
                    "q": f"DOI:{paper['doi']}",
                    "api_key": SERP_API_KEY,
                    "hl": "en",
                },
                timeout=30,
            )
            response.raise_for_status()
            hits = response.json().get("organic_results", [])
            if hits:
                paper['snippet_en'] = hits[0].get("snippet", "No abstract snippet available.")
            else:
                paper['snippet_en'] = "No content found."
            print(" [OK]")
        except Exception as err:
            paper['snippet_en'] = f"Search Error: {err}"
            print(" [Error]")
        # Short pause between requests to stay under the API rate limit.
        time.sleep(0.6)
    return tasks
|
||
|
||
|
||
def _parse_ds_reply(content):
    """Split a DeepSeek reply into its three labelled fields.

    A line starts a field when the text before its first colon (ASCII ``:`` or
    full-width ``:``) contains one of the labels; spaces inside that text are
    ignored so ``AI 总结`` matches too. Later non-empty lines without a label
    are appended to the field currently being read, so multi-line answers are
    not truncated.

    Returns:
        dict: any of ``title_zh``, ``snippet_zh``, ``summary_zh`` that were found.
    """
    labels = (("中文标题", "title_zh"), ("中文摘要", "snippet_zh"), ("AI总结", "summary_zh"))
    parsed = {}
    current = None
    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        cuts = [pos for pos in (line.find(":"), line.find(":")) if pos != -1]
        field = None
        if cuts:
            cut = min(cuts)
            head = line[:cut].replace(" ", "")
            field = next((key for label, key in labels if label in head), None)
        if field:
            value = line[cut + 1:].strip()
            # "**label:** value" leaves the closing bold marker on the value.
            if value.startswith("**") and "**" in head:
                value = value[2:].strip()
            parsed[field] = value
            current = field
        elif current:
            parsed[current] = f"{parsed[current]} {line}".strip()
    return parsed


def summarize_with_ds(tasks):
    """[STEP 4] Translate and summarise every task with the DeepSeek chat API.

    For each task a prompt built from ``title_en`` and ``snippet_en`` is sent
    to DS_API_URL. The reply is parsed into ``title_zh``, ``snippet_zh`` and
    ``summary_zh``, which are stored on the task when present. A task whose
    request or parsing fails is left without those keys and the failure is
    printed. The list is modified in place and returned.
    """
    print(f"\n--- [STEP 4] 正在请求 DeepSeek 处理内容 ---")
    headers = {"Authorization": f"Bearer {DS_API_KEY}", "Content-Type": "application/json"}
    for i, task in enumerate(tasks):
        print(f" > 处理第 {i + 1} 篇...", end="", flush=True)
        prompt = (f"你是一个光学专家。请翻译以下论文标题和摘要为中文,并总结核心创新点(200字内)。\n"
                  f"标题: {task['title_en']}\n摘要: {task['snippet_en']}\n"
                  f"格式要求:\n中文标题:xxx\n中文摘要:xxx\nAI总结:xxx")

        payload = {
            "model": "deepseek-chat",
            "messages": [{"role": "system", "content": "你是一个严谨的科研助手。"}, {"role": "user", "content": prompt}]
        }
        try:
            response = requests.post(DS_API_URL, json=payload, headers=headers, timeout=30)
            # Surface HTTP errors (bad key, quota) instead of a KeyError on 'choices'.
            response.raise_for_status()
            content = response.json()['choices'][0]['message']['content'].strip()
            task.update(_parse_ds_reply(content))
            print(" [完成]")
        except Exception as e:
            # Not a bare except, so Ctrl-C still stops the run; the reason is shown.
            print(f" [失败: {e}]")
    return tasks
|
||
|
||
|
||
def save_to_markdown(tasks):
    """[STEP 5] Write the weekly report as a Markdown file.

    The file is named ``<YYYY-MM-DD>周报.md`` and is created in the current
    working directory. It holds a table of contents followed by one section
    per task. Missing translated fields are rendered as ``N/A``.

    Args:
        tasks: list of dicts; ``journal``, ``doi`` and ``title_en`` are
            required, ``title_zh``, ``hit_keyword``, ``snippet_en``,
            ``snippet_zh`` and ``summary_zh`` are optional.

    Returns:
        str: absolute path of the written file.
    """
    print(f"\n--- [STEP 5] 正在生成 Markdown 报告 ---")
    today_str = datetime.date.today().strftime("%Y-%m-%d")
    file_name = f"{today_str}周报.md"

    parts = [f"# 光学行业学术动态周报 ({today_str})\n\n", "## 目录\n"]
    for idx, task in enumerate(tasks, start=1):
        parts.append(f"- [{idx}. {task.get('title_zh', 'N/A')}](#task-{idx})\n")
    parts.append("\n---\n\n")

    for idx, task in enumerate(tasks, start=1):
        title_zh = task.get('title_zh', 'N/A')
        # The explicit anchor is what the table-of-contents links point to;
        # without it "#task-N" resolves to nothing.
        parts.append(f'### <a id="task-{idx}"></a>{idx}. {title_zh}\n\n')
        # Two trailing spaces force a Markdown hard line break so the DOI
        # renders on its own line.
        parts.append(f"**期刊**: {task['journal']} | **匹配关键词**: {task.get('hit_keyword', 'N/A')}  \n")
        parts.append(f"**DOI**: [{task['doi']}](https://doi.org/{task['doi']})\n\n")

        parts.append("#### 【标题】\n")
        parts.append(f"- **中文**: {title_zh}\n")
        parts.append(f"- **英文**: {task['title_en']}\n\n")

        parts.append("#### 【摘要原文】\n")
        parts.append(f"> *{task.get('snippet_en', 'N/A')}*\n\n")

        parts.append("#### 【摘要翻译】\n")
        parts.append(f"> {task.get('snippet_zh', 'N/A')}\n\n")

        parts.append("#### 【AI 深度总结】\n")
        parts.append(f"{task.get('summary_zh', 'N/A')}\n\n")

        parts.append("---\n\n")

    # Render first, write once: a task with a missing key leaves no partial file.
    with open(file_name, "w", encoding="utf-8") as f:
        f.writelines(parts)

    report_path = os.path.abspath(file_name)
    print(f"✅ Markdown 周报已保存: {report_path}")
    return report_path
|
||
|
||
|
||
def markdown_to_pdf(md_path, css_path, pdf_path=None):
    """[STEP 6] Render a Markdown file to PDF with the given stylesheet.

    Args:
        md_path: path of the Markdown source.
        css_path: path of the CSS file applied to the rendered HTML.
        pdf_path: output path; defaults to ``md_path`` with a ``.pdf`` suffix.

    Returns:
        str | None: absolute path of the PDF, or None when either input file
        does not exist (a message is printed in that case).
    """
    for label, required in (("Markdown", md_path), ("CSS", css_path)):
        if not os.path.exists(required):
            print(f"❌ {label} 文件不存在: {required}")
            return None

    if pdf_path is None:
        stem, _ = os.path.splitext(md_path)
        pdf_path = stem + ".pdf"

    with open(md_path, "r", encoding="utf-8") as source:
        markdown_text = source.read()

    rendered = md.markdown(markdown_text, extensions=["extra", "tables", "sane_lists"])
    # The body is wrapped in a div with id 'write' for the stylesheet to target.
    page = "".join([
        "<!doctype html>",
        "<html><head><meta charset='utf-8'></head>",
        f"<body><div id='write'>{rendered}</div></body></html>",
    ])

    # Relative URLs in the document resolve against the Markdown file's folder.
    source_dir = os.path.dirname(os.path.abspath(md_path))
    document = HTML(string=page, base_url=source_dir)
    document.write_pdf(pdf_path, stylesheets=[CSS(filename=css_path)])

    result_path = os.path.abspath(pdf_path)
    print(f"✅ PDF 已生成: {result_path}")
    return result_path
|
||
|
||
|
||
def send_pdf_via_email(pdf_path, subject, to_email):
    """[STEP 7] E-mail the PDF as an attachment through the configured SMTP server.

    Args:
        pdf_path: path of the PDF to attach.
        subject: subject line of the message.
        to_email: recipient address.

    Returns:
        bool: True when the message was handed to the server. False when the
        mail configuration or recipient is empty, the PDF does not exist or
        cannot be read, or login/sending fails; the reason is printed.
    """
    if not MAIL_SENDER or not MAIL_APP_PASSWORD or not to_email:
        print("⚠️ 邮箱配置为空,跳过发送。")
        return False
    if not os.path.exists(pdf_path):
        print(f"❌ PDF 文件不存在: {pdf_path}")
        return False

    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = MAIL_SENDER
    msg["To"] = to_email
    msg.set_content("技术前沿周报已生成,详见附件 PDF。")

    try:
        with open(pdf_path, "rb") as f:
            pdf_data = f.read()

        msg.add_attachment(
            pdf_data,
            maintype="application",
            subtype="pdf",
            filename=os.path.basename(pdf_path)
        )

        # The timeout keeps an unreachable or stalled server from blocking
        # the script indefinitely.
        with smtplib.SMTP_SSL(MAIL_SMTP_SERVER, MAIL_SMTP_PORT, timeout=30) as server:
            server.login(MAIL_SENDER, MAIL_APP_PASSWORD)
            server.send_message(msg)
        print(f"✅ 邮件已发送: {to_email}")
        return True
    except Exception as e:
        # Best-effort boundary: a mail failure is reported, never raised.
        print(f"❌ 邮件发送失败: {e}")
        return False
|
||
|
||
|
||
# ================= 3. 执行入口 =================
|
||
def _run_pipeline():
    """Run steps 1-7 in order, stopping early when a step yields nothing."""
    # 1. Load the journal configuration.
    journal_list = get_journal_configs()
    if not journal_list:
        return

    # 2. Search keyword by keyword.
    candidates = fetch_dois_crossref(journal_list)
    if not candidates:
        print("\n📭 最近一周内未搜索到相关关键词的论文。")
        return

    # 3. Add snippets (SerpApi), then 4. translate and summarise (DeepSeek).
    reports = summarize_with_ds(enrich_content_serpapi(candidates))

    # 5. Print a console preview of the digest.
    rule = "=" * 60
    print("\n" + rule)
    print(f"📊 最终简报汇总预览 - {datetime.date.today()}")
    print(rule)
    for entry in reports:
        print(f"\n【{entry['journal']}】(关键词: {entry.get('hit_keyword')})")
        print(f"中文标题:{entry.get('title_zh', 'N/A')}")
        print(f"AI 总结:{entry.get('summary_zh', 'N/A')}")
        print(f"链接: https://doi.org/{entry['doi']}")
        print("-" * 40)

    # 6. Write the Markdown report.
    report_md = save_to_markdown(reports)

    # 7. Render the PDF with the stylesheet next to this script, then mail it.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    report_pdf = markdown_to_pdf(report_md, os.path.join(script_dir, "github.css"))
    if not report_pdf:
        return
    stamp = datetime.date.today().strftime("%Y-%m-%d")
    send_pdf_via_email(report_pdf, f"{stamp}技术前沿周报", MAIL_RECIPIENT)


if __name__ == "__main__":
    started_at = time.time()
    _run_pipeline()
    print(f"\n✨ 总耗时: {round(time.time() - started_at, 2)}s")