Feishu Doc Markdown Scraper

⚡功能:以Markdown格式复制文档内容; ⚡使用方法:点击[准备复制],然后等自动滑动到底部后,点击[复制]即可; ⚡因为飞书文档本身不支持导出Markdown,所以做了本插件,调试时发现飞书的文档加载是随着页面滚动而动态加载的,所以最终只能这么实现了。。

このスクリプトの質問や評価の投稿はこちら、通報はこちらへお寄せください。
  1. // ==UserScript==
  2. // @name Feishu Doc Markdown Scraper
  3. // @namespace http://tampermonkey.net/
  4. // @version 0.1.1
  5. // @description ⚡功能:以Markdown格式复制文档内容; ⚡使用方法:点击[准备复制],然后等自动滑动到底部后,点击[复制]即可; ⚡因为飞书文档本身不支持导出Markdown,所以做了本插件,调试时发现飞书的文档加载是随着页面滚动而动态加载的,所以最终只能这么实现了。。
  6. // @author Yearly
  7. // @match *://*.feishu.cn/docx/*
  8. // @match *://*.feishu.cn/wiki/*
  9. // @license AGPL-v3.0
  10. // @grant GM_setClipboard
  11. // @grant GM_addStyle
  12. // @homepage https://greatest.deepsurf.us/zh-CN/scripts/497029-feishu-doc-markdown-scraper
  13. // @icon 
  14. // ==/UserScript==
  15.  
  16. (function() {
  17. 'use strict';
  18.  
  /**
   * Convert the HTML of one Feishu document block into Markdown text.
   *
   * Works in two passes:
   *   1. Regular expressions rewrite simple inline and heading tags.
   *   2. The result is parsed with DOMParser so that lists, Feishu heading
   *      divs, images, file cards and code blocks can be rewritten
   *      structurally; the text of the parsed body is the final Markdown.
   *
   * Generated Markdown is written back into the parsed document as plain
   * text (text nodes / textContent) instead of HTML, so content containing
   * "<" or "&" (for example `#include <stdio.h>` inside a code block) is
   * not re-parsed as markup and lost.
   *
   * @param {string} html - HTML markup of a document block.
   * @returns {string} Markdown text in which every ":" is escaped as "\:".
   */
  function convertToMarkdown(html) {
    // Pass 1: simple tag-to-Markdown substitutions.
    const markdown = html
      .replace(/<b>(.*?)<\/b>/gi, '**$1**')
      .replace(/<i>(.*?)<\/i>/gi, '*$1*')
      .replace(/<strong>(.*?)<\/strong>/gi, '**$1**')
      .replace(/<em>(.*?)<\/em>/gi, '*$1*')
      .replace(/<h1.*?>(.*?)<\/h1>/gi, '# $1\n')
      .replace(/<h2.*?>(.*?)<\/h2>/gi, '## $1\n')
      .replace(/<h3.*?>(.*?)<\/h3>/gi, '### $1\n')
      // The closing tags below used to be </h3>, so h4/h5 never matched.
      .replace(/<h4.*?>(.*?)<\/h4>/gi, '#### $1\n')
      .replace(/<h5.*?>(.*?)<\/h5>/gi, '##### $1\n')
      .replace(/<p>(.*?)<\/p>/gi, '$1\n\n')
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<a href="(.*?)">(.*?)<\/a>/gi, '[$2]($1)')
      .replace(/<code>(.*?)<\/code>/gi, '`$1`');

    // Pass 2: structural rewrites on a parsed (detached) document.
    const doc = new DOMParser().parseFromString(markdown, 'text/html');

    // Count the <ul>/<ol> ancestors of a node, i.e. its list nesting depth.
    function countParents(node) {
      let depth = 0;
      let current = node;
      while (current.parentNode) {
        current = current.parentNode;
        if (current.tagName) {
          const tag = current.tagName.toUpperCase();
          if (tag === 'UL' || tag === 'OL') {
            depth++;
          }
        }
      }
      return depth;
    }

    // Render the direct <li> children of a list as Markdown list lines,
    // indented by two spaces per nesting level.
    function processList(element) {
      const indent = ' '.repeat(countParents(element) * 2);
      // Ordered lists are numbered from 1; unordered lists use "-".
      let index = element.tagName.toUpperCase() === 'OL' ? 1 : null;
      let md = '';
      element.childNodes.forEach((node) => {
        if (node.tagName && node.tagName.toLowerCase() === 'li') {
          const marker = index === null ? '-' : `${index++}.`;
          md += `${indent}${marker} ${node.textContent.trim()}\n`;
        }
      });
      return md;
    }

    // Reverse document order so nested lists are rewritten before the
    // lists that contain them.
    Array.from(doc.querySelectorAll('ol, ul'))
      .reverse()
      .forEach((list) => {
        list.replaceWith(doc.createTextNode(processList(list)));
      });

    // Feishu heading divs: the level is carried by a "heading-hN" class.
    const headingPrefixes = {
      'heading-h1': '#',
      'heading-h2': '##',
      'heading-h3': '###',
      'heading-h4': '####',
      'heading-h5': '#####',
    };
    doc.querySelectorAll('div.heading').forEach((heading) => {
      const levelClass = Object.keys(headingPrefixes).find((cls) => heading.classList.contains(cls));
      if (levelClass) {
        heading.textContent = `\n\n${headingPrefixes[levelClass]} ${heading.textContent}\n`;
      }
    });

    // Images: store the Markdown image syntax as the element's text so it
    // shows up when the body text is read back below.
    doc.querySelectorAll('img[src]').forEach((img) => {
      img.textContent = `\n![image](${img.src})\n`;
    });

    // File cards: put the size on its own line, then wrap the card text in
    // a fenced "file" block.
    doc.querySelectorAll('div.chat-uikit-multi-modal-file-image-content').forEach((card) => {
      card.innerHTML = card.innerHTML
        .replace(/<span class="chat-uikit-file-card__info__size">(.*?)<\/span>/gi, '\n$1');
      card.textContent = `\n\`\`\`file\n${card.textContent}\n\`\`\`\n`;
    });

    // Code blocks: the header supplies the language label, the remaining
    // text is the code itself.
    doc.querySelectorAll('div.docx-code-block-container > div.docx-code-block-inner-container').forEach((codeArea) => {
      const headerBar = codeArea.querySelector('div.code-block-header');
      const languageLabel = codeArea.querySelector('div.code-block-header .code-block-header-btn-con');
      // Tolerate a missing header instead of throwing on null.
      const language = languageLabel ? languageLabel.textContent : '';
      if (headerBar) {
        headerBar.remove();
      }
      // Spans flagged data-enter="true" stand for line breaks in the code.
      codeArea.querySelectorAll('span[data-enter="true"]').forEach((lineBreak) => {
        lineBreak.outerHTML = '<p>\n</p>';
      });
      const codeContent = codeArea.innerText.toString();
      codeArea.replaceWith(doc.createTextNode(`\n\`\`\`${language}\n${codeContent}\n\`\`\`\n`));
    });

    // Read the final Markdown back as the plain text of the parsed body.
    const text = doc.body.innerText || doc.body.textContent;

    return text.replaceAll(':', '\\:');
  }
  123.  
  /**
   * Invoke `callback` once an element matching `selector` exists.
   *
   * If a matching element is already in the document the callback runs
   * synchronously. Otherwise document.body is observed for subtree
   * changes, and the callback runs the first time the selector matches,
   * after which the observer is disconnected.
   *
   * @param {string} selector - CSS selector of the element to wait for.
   * @param {(element: Element) => void} callback - Receives the matched element.
   */
  function waitForElement(selector, callback) {
    // Without this check an element that is already present is never
    // reported unless some unrelated DOM mutation happens afterwards.
    const existing = document.querySelector(selector);
    if (existing) {
      callback(existing);
      return;
    }

    const observer = new MutationObserver(() => {
      const element = document.querySelector(selector);
      if (element) {
        observer.disconnect();
        callback(element);
      }
    });
    observer.observe(document.body, { childList: true, subtree: true });
  }
  135.  
  // Shared scraper state.
  // dataBlocks maps each data-block-id to the Markdown produced for it.
  const dataBlocks = new Map();
  // isScrolling is the guard flag used by the scroll-and-scan routine.
  let isScrolling = false;

  /**
   * Convert every document block currently in the DOM to Markdown and
   * store it in `dataBlocks`, keyed by its data-block-id.
   *
   * Blocks whose id is already stored are skipped, so repeated calls while
   * the page scrolls only add newly rendered blocks. Blocks of type
   * "back_ref_list" are ignored. For the "page" block only the contents of
   * its `div.page-block-content` child are converted; if that child is not
   * present the block is left unrecorded so a later call can pick it up
   * (previously this case threw and aborted the whole pass).
   */
  function scrapeDataBlocks() {
    const blocks = document.querySelectorAll('#docx > div div[data-block-id]');
    blocks.forEach((block) => {
      const id = block.getAttribute('data-block-id');
      if (dataBlocks.has(id)) {
        return;
      }

      const type = block.getAttribute('data-block-type');
      if (type === 'back_ref_list') {
        return;
      }

      if (type === 'page') {
        const pageContent = block.querySelector('div.page-block-content');
        if (!pageContent) {
          return;
        }
        dataBlocks.set(id, convertToMarkdown(pageContent.innerHTML));
        return;
      }

      dataBlocks.set(id, convertToMarkdown(block.innerHTML));
    });
  }
  159.  
  /**
   * Scroll `container` from the top to the bottom in steps, scraping the
   * rendered document blocks along the way, then switch the toolbar button
   * to copy mode.
   *
   * Each cycle scrapes once and scrolls three times by a third of the
   * visible height (immediately, after 500 ms and after 1000 ms); the next
   * cycle starts after 1600 ms so that lazily loaded content has time to
   * render. While scanning, the button shows a progress percentage and is
   * disabled. Calls made while a scan is already running are ignored.
   *
   * @param {Element|null} container - Scrollable element holding the
   *   document; nothing happens when it is null or undefined.
   */
  function scrollAndScrape(container) {
    if (!container) {
      // A missing container used to crash later inside a timer and leave
      // the in-progress flag set forever.
      console.warn('scrollAndScrape: scroll container not found');
      return;
    }
    if (isScrolling) {
      return;
    }
    isScrolling = true;
    let currentY = 0;
    let percent = 0;

    // Advance one step and refresh the progress text on the button.
    function scroll() {
      currentY += container.clientHeight / 3;
      // ScrollToOptions has no "duration" member, so it was dropped.
      container.scrollTo({ top: currentY, behavior: 'smooth' });

      // Squaring keeps the displayed progress conservative early on; the
      // max() keeps the displayed value from ever decreasing.
      let curPercent = (currentY + container.clientHeight) / container.scrollHeight;
      curPercent = Math.min(1, curPercent * curPercent) * 100;
      percent = Math.max((curPercent + percent) / 2, percent);

      const button = document.querySelector('button#scrollCopyButton');
      if (button) {
        button.textContent = `请勿操作, 正在扫描内容: ${percent.toFixed(1)}%`;
        button.disabled = true;
        button.style.cursor = 'not-allowed';
      }
    }

    // Scrape what is rendered now, then either finish or schedule the
    // next scrolling cycle.
    function scrollData() {
      scrapeDataBlocks();
      console.log(`scrolling ${container.scrollTop.toFixed()}`);
      if (Math.max(container.scrollTop, currentY) + container.clientHeight >= container.scrollHeight) {
        // Scan finished: clear the in-progress flag (it used to be set to
        // true again here) and hand over to copy mode.
        isScrolling = false;
        createCopyButton(true);
        return;
      }
      scroll();
      setTimeout(scroll, 500);
      setTimeout(scroll, 1000);
      // Throttle cycles so the page can load content before the next scrape.
      setTimeout(scrollData, 1600);
    }

    setTimeout(scrollData, 500);
  }
  200.  
  /**
   * Click handler for the button in its initial state: logs the click and
   * starts the scroll-and-scan pass on the document's scroll container.
   */
  function SyncListener() {
    console.log('click sync');
    const scrollContainer = document.querySelector('#docx > div');
    scrollAndScrape(scrollContainer);
  }
  206.  
  /**
   * Click handler for the button in copy mode: joins the Markdown of all
   * scraped blocks with newlines, puts it on the clipboard and shows a
   * confirmation alert.
   *
   * Blocks are ordered by subtracting their ids, which coerces the ids to
   * numbers. NOTE(review): this presumes data-block-id values are numeric;
   * for non-numeric ids the difference is NaN, which the sort treats as
   * "equal" - confirm against real documents.
   */
  function CopyListener() {
    console.log('click copy');

    const orderedEntries = [...dataBlocks.entries()];
    orderedEntries.sort(([leftId], [rightId]) => leftId - rightId);

    const pieces = [];
    for (const [, blockMarkdown] of orderedEntries) {
      pieces.push(blockMarkdown);
    }

    GM_setClipboard(pieces.join('\n'));
    alert('内容已复制到剪贴板');
  }
  217.  
  /**
   * Ensure the floating toolbar button exists and optionally switch it to
   * copy mode.
   *
   * On first use the button is created, appended to document.body, styled
   * through GM_addStyle and wired to SyncListener. When `mode` is true the
   * button is re-enabled, relabelled and its click handler is swapped from
   * SyncListener to CopyListener.
   *
   * @param {boolean} [mode=false] - true to put the button into copy mode.
   */
  function createCopyButton(mode = false) {
    const iconMarkup = '<svg xmlns="http://www.w3.org/2000/svg" style="height:15px; padding-right:5px; fill:#fff; display:inline;" viewBox="0 0 640 512"><path d="M593.8 59.1H46.2C20.7 59.1 0 79.8 0 105.2v301.5c0 25.5 20.7 46.2 46.2 46.2h547.7c25.5 0 46.2-20.7 46.1-46.1V105.2c0-25.4-20.7-46.1-46.2-46.1zM338.5 360.6H277v-120l-61.5 76.9-61.5-76.9v120H92.3V151.4h61.5l61.5 76.9 61.5-76.9h61.5v209.2zm135.3 3.1L381.5 256H443V151.4h61.5V256H566z"/></svg>';
    const withIcon = (label) => iconMarkup + label;

    // Build the button in its initial "prepare" state and attach it to the page.
    function buildScanButton() {
      const created = document.createElement('button');
      created.id = 'scrollCopyButton';
      created.innerHTML = withIcon('准备复制');
      document.body.appendChild(created);

      GM_addStyle(`
#scrollCopyButton {
position: fixed;
top: 15px;
right: 40%;
padding: 6px 18px;
font-size: 16px;
background: #007bff;
color: white;
border: none;
border-radius: 5px;
cursor: pointer;
z-index: 1000;
display: flex;
place-items: center;
box-shadow: 0 0 3px #1117;
}
#scrollCopyButton:hover {
background: #0056b3;
}
`);

      created.addEventListener('click', SyncListener);
      return created;
    }

    // Reuse the button when it is already on the page.
    const button = document.querySelector('button#scrollCopyButton') || buildScanButton();

    if (mode) {
      button.disabled = false;
      button.style.cursor = 'pointer';
      button.innerHTML = withIcon('复制');

      button.removeEventListener('click', SyncListener);
      button.addEventListener('click', CopyListener);
    }
  }
  266.  
  // Entry point: once the document's scroll container exists, show the
  // button in its initial (not yet scanned) state.
  function onDocumentReady() {
    createCopyButton(false);
  }
  waitForElement('#docx > div', onDocumentReady);
  271.  
  272. })();