// arXiv Affiliation Highlighter (并发+进度条版)
//
// 在页面上的 arXiv 链接旁自动标注论文机构列表（并发8个+进度条）
// Ask a question, post a review, or report the script.
// Wrap lines
// ==UserScript==
// @license MIT
// @name         arXiv Affiliation Highlighter (并发+进度条版)
// @namespace    http://tampermonkey.net/
// @version      1.5
// @description  在页面上的 arXiv 链接旁自动标注论文机构列表（并发8个+进度条）
// @author       Zezhou Wang
// @match        *://*/*
// @grant        GM_xmlhttpRequest
// @grant        GM_getValue
// @grant        GM_setValue
// @grant        GM_registerMenuCommand
// @connect      arxiv.org
// @connect      api.openai.com
// @require      https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.8.162/pdf.min.js
// ==/UserScript==
 
;(async function() {
  'use strict';
 
  let OPENAI_API_KEY = GM_getValue('OPENAI_API_KEY', '');
  if (!OPENAI_API_KEY) {
    OPENAI_API_KEY = prompt('Please enter your OpenAI API Key:');
    if (OPENAI_API_KEY) {
      GM_setValue('OPENAI_API_KEY', OPENAI_API_KEY);
    } else {
      alert('No OpenAI API Key provided — script will stop.');
      return;
    }
  }
  const MODEL = 'gpt-4o-mini';
  const CACHE_KEY = 'arxivAffCache';
  const CACHE_TTL = 1000 * 60 * 60 * 24 * 365;
 
  pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.8.162/pdf.worker.min.js';
  GM_registerMenuCommand('🗑 清空 arXiv 机构缓存', () => {
    GM_setValue(CACHE_KEY, {});
    alert('已清空 arXiv 机构缓存');
  });
 
  let cache = GM_getValue(CACHE_KEY, {});
  const links = Array.from(
    document.querySelectorAll('a[href*="arxiv.org/abs/"], a[href*="arxiv.org/pdf/"]')
  );
 
  if (links.length === 0) return;
 
  // —— 添加进度条 —— //
  const progressBar = createProgressBar();
  updateProgressBar(0, links.length);
 
  let completed = 0;
 
  const tasks = links.map(link => async () => {
    const id = extractArxivId(link.href);
    if (!id) {
      incrementProgress();
      return;
    }
    let title = id;
    try {
      title = await fetchTitle(id);
    } catch (e) {
      // 忽略标题抓取失败
    }
    try {
      const entry = cache[id];
      if (entry && Date.now() - entry.ts < CACHE_TTL) {
        annotate(link, entry.affs);
      } else {
        const affs = await fetchAndAnnotate(id);
        annotate(link, affs);
        cache[id] = { affs, ts: Date.now() };
        GM_setValue(CACHE_KEY, cache);
      }
      incrementProgress();
    } catch (err) {
      console.error(err);
      alert('处理 arXiv ID ' + id + ' 时发生错误：\n' + err.message);
      removeProgressBar();
      throw err;
    }
  });
 
  try {
    await runTasksWithConcurrency(tasks, 8);
  } catch (e) {
    console.error('脚本执行中止：', e);
    // 错误时 progress bar 已经移除
  }
  removeProgressBar();
 
  // —— 辅助函数 —— //
 
  /**
   * Extract the arXiv identifier from an `arxiv.org/abs/...` or
   * `arxiv.org/pdf/...` URL.
   *
   * Recognises new-style identifiers (`2301.12345`, optionally followed by a
   * version such as `v2`) and old-style identifiers (`hep-th/9901001`,
   * `math.GT/0309136`, optionally versioned). Anything after the identifier,
   * such as a `.pdf` extension, query string or fragment, is excluded, so the
   * result can be interpolated directly into another arXiv URL.
   *
   * @param {string} url - Link target to inspect.
   * @returns {string|null} The identifier, or null when none is present.
   */
  function extractArxivId(url) {
    const m = String(url).match(
      /arxiv\.org\/(?:abs|pdf)\/(\d{4}\.\d{4,5}(?:v\d+)?|[a-z-]+(?:\.[A-Za-z]{2})?\/\d{7}(?:v\d+)?)/
    );
    return m ? m[1] : null;
  }
 
  /**
   * GET a URL through GM_xmlhttpRequest and resolve with the response as text.
   *
   * The returned promise always settles: timeouts and aborts reject it just
   * like network errors do, so an awaiting caller is never left pending.
   *
   * @param {string} url - Address to request.
   * @returns {Promise<string>} Response text of an HTTP 200 reply.
   * @throws {Error} (as rejection) on a non-200 status, network error,
   *   timeout, or abort.
   */
  function gmFetchText(url) {
    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'GET',
        url,
        responseType: 'text',
        onload: res => res.status === 200 ? resolve(res.response) : reject(new Error(`HTTP ${res.status}`)),
        onerror: () => reject(new Error('Network error')),
        ontimeout: () => reject(new Error('Request timed out')),
        onabort: () => reject(new Error('Request aborted')),
      });
    });
  }
 
  /**
   * Look up a paper's title from its arXiv abstract page.
   *
   * Preference order: the text of `h1.title` with a leading "Title:" label
   * stripped; then the part of `<title>` before the first "|" (or the whole
   * `<title>` text when there is no "|"); finally the id itself.
   *
   * @param {string} id - arXiv identifier used to build the abstract URL.
   * @returns {Promise<string>} The trimmed title, or `id` as a fallback.
   */
  async function fetchTitle(id) {
    const markup = await gmFetchText(`https://arxiv.org/abs/${id}`);
    const parsed = new DOMParser().parseFromString(markup, 'text/html');

    const heading = parsed.querySelector('h1.title');
    if (heading) {
      return heading.textContent.replace(/^Title:\s*/i, '').trim();
    }

    const titleTag = parsed.querySelector('title');
    if (!titleTag) {
      return id;
    }

    const beforePipe = titleTag.textContent.match(/^(.+?)\s*\|/);
    const raw = beforePipe ? beforePipe[1] : titleTag.textContent;
    return raw.trim();
  }
 
  /**
   * Produce the affiliation list for one paper: download its PDF, pull the
   * text of the first page, and hand that text to the OpenAI extractor.
   *
   * Despite its name this function does not modify the page; it only returns
   * the extracted list.
   *
   * @param {string} id - arXiv identifier used to build the PDF URL.
   * @returns {Promise<*>} Whatever gmOpenAIExtractAffs resolves with.
   */
  async function fetchAndAnnotate(id) {
    const pdfUrl = `https://arxiv.org/pdf/${id}.pdf`;
    return gmFetchPdf(pdfUrl)
      .then(pdfBytes => extractFirstPageText(pdfBytes))
      .then(firstPageText => gmOpenAIExtractAffs(firstPageText));
  }
 
  /**
   * Download a PDF through GM_xmlhttpRequest as binary data.
   *
   * The returned promise always settles: timeouts and aborts reject it just
   * like network errors do, so an awaiting caller is never left pending.
   *
   * @param {string} url - Address of the PDF.
   * @returns {Promise<ArrayBuffer>} Response body of an HTTP 200 reply
   *   (requested with responseType 'arraybuffer').
   * @throws {Error} (as rejection) on a non-200 status, network error,
   *   timeout, or abort.
   */
  function gmFetchPdf(url) {
    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'GET',
        url,
        responseType: 'arraybuffer',
        onload: res =>
          res.status === 200 ? resolve(res.response) : reject(new Error(`PDF 下载失败：${res.status}`)),
        onerror: () => reject(new Error('PDF 下载错误')),
        ontimeout: () => reject(new Error('PDF 下载超时')),
        onabort: () => reject(new Error('PDF 下载被中止')),
      });
    });
  }
 
  /**
   * Extract the text of page 1 of a PDF using pdf.js.
   *
   * The pdf.js loading task is destroyed afterwards, on success and on
   * failure, so the resources it holds are released instead of accumulating
   * once per processed paper.
   *
   * @param {ArrayBuffer} buffer - Raw PDF data, passed to pdf.js as `data`.
   * @returns {Promise<string>} The page's text items joined with single spaces.
   * @throws {Error} (as rejection) when pdf.js cannot load the document or
   *   read the page.
   */
  async function extractFirstPageText(buffer) {
    const loadingTask = pdfjsLib.getDocument({ data: buffer });
    try {
      const pdf = await loadingTask.promise;
      const page = await pdf.getPage(1);
      const content = await page.getTextContent();
      return content.items.map(item => item.str).join(' ');
    } finally {
      // Cleanup is best-effort: a failure here must not discard the extracted
      // text or mask the original error.
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.warn('pdf.js cleanup failed', cleanupError);
      }
    }
  }
 
  /**
   * Ask the OpenAI chat-completions endpoint to list the author affiliations
   * found in a paper's first-page text.
   *
   * The reply is split into lines, trimmed, stripped of empty lines and
   * de-duplicated (exact string match, first occurrence kept).
   *
   * The returned promise always settles: timeouts and aborts reject it just
   * like network errors do, so an awaiting caller is never left pending.
   *
   * @param {string} text - First-page text embedded into the prompt.
   * @returns {Promise<string[]>} Unique affiliation lines in reply order.
   * @throws {Error} (as rejection) on a non-200 status, an unparsable reply,
   *   network error, timeout, or abort.
   */
  function gmOpenAIExtractAffs(text) {
    // Named `instructions` rather than `prompt` to avoid shadowing window.prompt.
    const instructions = `
Here is an example to illustrate the desired output format:
 
Example input (paper first page snippet):
"Alice is from Tsinghua University; Bob is from Peking University; Carol is also from Tsinghua University."
 
Example output (one institution per line):
Tsinghua University
Peking University
 
Now please:
1) Extract all author affiliations from the first page text below.
2) Output one affiliation per line, with no numbering or extra commentary.
3) Ensure each institution appears only once (deduplication will also be applied in the script).
 
First page text:
${text}
 
Please start listing the affiliations, one per line:`;

    return new Promise((resolve, reject) => {
      GM_xmlhttpRequest({
        method: 'POST',
        url: 'https://api.openai.com/v1/chat/completions',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENAI_API_KEY}`,
        },
        data: JSON.stringify({
          model: MODEL,
          messages: [{ role: 'user', content: instructions }],
          temperature: 0,
        }),
        responseType: 'json',
        onload: res => {
          if (res.status !== 200) {
            reject(new Error(`OpenAI request failed: ${res.status}`));
            return;
          }
          try {
            const lines = res.response.choices[0].message.content
              .split('\n')
              .map(l => l.trim())
              .filter(l => l);
            resolve(Array.from(new Set(lines)));
          } catch {
            // Covers a null/ill-shaped JSON body as well as a missing message.
            reject(new Error('Failed to parse GPT response'));
          }
        },
        onerror: () => reject(new Error('Network error during OpenAI request')),
        ontimeout: () => reject(new Error('OpenAI request timed out')),
        onabort: () => reject(new Error('OpenAI request aborted')),
      });
    });
  }
 
  /**
   * Insert a highlighted badge listing the affiliations right after a link.
   * Does nothing when the list is missing or empty.
   *
   * @param {Element} linkEl - Link element the badge is placed after.
   * @param {string[]} affs - Affiliation names, shown joined by ", ".
   */
  function annotate(linkEl, affs) {
    const hasAffiliations = Boolean(affs && affs.length);
    if (!hasAffiliations) {
      return;
    }

    const badge = document.createElement('span');
    badge.textContent = affs.join(', ');
    badge.style.cssText = `
      background: #fffbdd;
      color: #333;
      padding: 2px 4px;
      margin-left: 6px;
      border-radius: 3px;
      font-size: 90%;
      font-family: sans-serif;
    `;

    linkEl.after(badge);
  }
 
  /**
   * Run deferred tasks with at most `concurrency` of them in flight at once.
   *
   * Tasks are started in order. As soon as a failure is observed, no further
   * tasks are started and the returned promise rejects with the first error;
   * tasks already in flight are left to finish on their own. Every task
   * outcome is handled internally, so a failing task never produces an
   * unhandled rejection and its error cannot be lost to a race between tasks.
   *
   * @param {Array<function(): (Promise<*>|*)>} tasks - Functions that start
   *   the work when called.
   * @param {number} concurrency - Maximum number of tasks in flight.
   * @returns {Promise<void>} Resolves once every task has completed.
   * @throws {*} (as rejection) the first error raised by any task.
   */
  async function runTasksWithConcurrency(tasks, concurrency) {
    const executing = new Set();
    let failed = false;
    let firstError;

    const throwIfFailed = () => {
      if (failed) {
        throw firstError;
      }
    };

    for (const task of tasks) {
      // The async wrapper turns a synchronous throw from task() into a
      // rejection; the two handlers make `tracked` itself never reject.
      const tracked = (async () => task())().then(
        () => {
          executing.delete(tracked);
        },
        err => {
          executing.delete(tracked);
          if (!failed) {
            failed = true;
            firstError = err;
          }
        }
      );
      executing.add(tracked);

      if (executing.size >= concurrency) {
        await Promise.race(executing);
        throwIfFailed();
      }
    }

    // Drain what is still running, stopping at the first recorded failure.
    while (executing.size > 0) {
      await Promise.race(executing);
      throwIfFailed();
    }
    throwIfFailed();
  }
 
  // —— 进度条相关 —— //
  /**
   * Create the fixed-position progress overlay (id `arxiv-aff-progress`) and
   * append it to the document body.
   *
   * @returns {HTMLDivElement} The newly created overlay element.
   */
  function createProgressBar() {
    const overlay = Object.assign(document.createElement('div'), {
      id: 'arxiv-aff-progress',
    });
    overlay.style.cssText = `
      position: fixed;
      top: 10px;
      right: 10px;
      background: rgba(0, 0, 0, 0.7);
      color: white;
      padding: 8px 12px;
      border-radius: 8px;
      font-size: 14px;
      z-index: 9999;
      font-family: sans-serif;
    `;
    document.body.appendChild(overlay);
    return overlay;
  }
 
  /**
   * Show "done / total" in the progress overlay, if the overlay is present.
   *
   * @param {number} done - Number of links handled so far.
   * @param {number} total - Total number of links.
   */
  function updateProgressBar(done, total) {
    const overlay = document.getElementById('arxiv-aff-progress');
    if (!overlay) {
      return;
    }
    overlay.textContent = `🔄 处理中：${done} / ${total}`;
  }
 
  /**
   * Count one more link as handled and refresh the progress overlay with the
   * new count out of the total number of links.
   */
  function incrementProgress() {
    completed += 1;
    const total = links.length;
    updateProgressBar(completed, total);
  }
 
  /**
   * Remove the progress overlay from the page. Safe to call when the overlay
   * is already gone.
   */
  function removeProgressBar() {
    const overlay = document.getElementById('arxiv-aff-progress');
    if (!overlay) {
      return;
    }
    overlay.remove();
  }
 
})();