Markdown Grabber

markdown downloader

  1. // ==UserScript==
  2. // @name Markdown Grabber
  3. // @namespace http://tampermonkey.net/
  4. // @version 1.0
  5. // @description markdown downloader
  6. // @author 5ec1cff
  7. // @match *://*/*
  8. // @license AGPL
  9. // @grant unsafeWindow
  10. // @grant GM_registerMenuCommand
  11. // @grant GM_xmlhttpRequest
  12. // @connect *
  13. // ==/UserScript==
  14.  
  15. // 2021.12.24 Fri: 修正
  16. // 2022.03.15 Tue: 增加下载图片支持(默认启用)
  17. // 2023.03.01 Wed: 支持 xz.aliyun.cn;下载图片附带 Referer
  18.  
  19. (function () {
  20. 'use strict';
  21.  
  22. if (window.top !== window) return; // 阻止在 iframe 启用
  23.  
  24. const downloadPics = true;
  25.  
  26. const picMap = new Map();
  27.  
  28. const console = unsafeWindow.console.context();
  29.  
  /**
   * Build the Markdown reference label used for a picture URL.
   *
   * The label is the URL itself with `\`, `[` and `]` backslash-escaped.
   * Markdown link labels may not contain unescaped brackets, so a raw URL
   * such as `img.php?size[w]=100` would otherwise break both the
   * `![][label]` reference and its `[label]:target` definition.
   *
   * @param {string} url - Picture URL. Non-string values are returned as-is.
   * @returns {string} Label that is safe to place between `[` and `]`.
   */
  function getPictureKey(url) {
    if (typeof url !== 'string') return url;
    return url.replace(/[\\[\]]/g, '\\$&');
  }
  33.  
  /**
   * Register a picture for the current export and return its reference label.
   *
   * The first call for a URL stores a promise in `picMap` (keyed by the URL)
   * that resolves to `[label, target]`. `target` is a data: URL when the
   * picture was downloaded successfully, otherwise the original URL. The
   * promise never rejects: a picture that cannot be fetched (network error,
   * abort, timeout, non-2xx status, unreadable body) degrades to a remote
   * reference instead of failing the whole export.
   *
   * @param {string} url - Picture URL, normally an `<img>` element's `src`.
   * @returns {string} Reference label to use in `![][label]`.
   */
  function getPicture(url) {
    const key = getPictureKey(url);
    if (picMap.has(url)) return key;

    // Entry used whenever the picture is not embedded.
    const fallback = [key, url];

    picMap.set(url, new Promise((resolve) => {
      // Only http(s) pictures are downloaded; data:, blob:, empty, ... keep their URL.
      if (!downloadPics || !url?.startsWith('http')) {
        resolve(fallback);
        return;
      }

      const giveUp = (reason, detail) => {
        console.warn(`picture not embedded (${reason}):`, url, detail);
        resolve(fallback);
      };

      GM_xmlhttpRequest({
        method: 'GET',
        url,
        // Some hosts refuse hot-linked pictures unless a Referer is sent.
        headers: { Referer: location.href },
        responseType: 'blob',
        onload(response) {
          // onload fires for every completed request, including 4xx/5xx;
          // do not embed an error page as if it were the picture.
          if (response.status < 200 || response.status >= 300) {
            giveUp(`HTTP ${response.status}`);
            return;
          }
          try {
            const reader = new FileReader();
            reader.onload = () => {
              console.log('load done:', url);
              resolve([key, reader.result]);
            };
            reader.onerror = () => giveUp('read error', reader.error);
            reader.readAsDataURL(response.response);
          } catch (err) {
            giveUp('unreadable response', err);
          }
        },
        onerror: (e) => giveUp('network error', e),
        onabort: (e) => giveUp('aborted', e),
        ontimeout: (e) => giveUp('timed out', e),
      });
    }));
    return key;
  }
  70.  
  /**
   * Render one inline element as Markdown.
   *
   * Handles bold, italic, strike-through, links, inline code and images;
   * any other element is rendered as the inline content of its children.
   *
   * @param {HTMLElement} e - Inline element to render.
   * @returns {string} Markdown for the element (may be empty).
   */
  function parseSimpleStyle(e) {
    // Wrap the inner text in an emphasis marker. Empty elements produce
    // nothing, because output such as `****` is not valid emphasis.
    const wrap = (marker) => {
      const inner = parseSingleLine(e);
      return inner ? `${marker}${inner}${marker}` : '';
    };

    switch (e.tagName.toLowerCase()) {
      case 'b':
      case 'strong':
        return wrap('**');
      case 'i':
      case 'em':
        return wrap('*');
      case 's':
      case 'strike':
      case 'del':
        return wrap('~~');
      case 'a': {
        const text = parseSingleLine(e);
        // Anchors without a target (e.g. named anchors) keep their text
        // instead of being dropped.
        return e.href ? `[${text}](${e.getAttribute('href')})` : text;
      }
      case 'code': {
        const text = e.innerText;
        const runs = text.match(/`+/g) ?? [];
        if (runs.length === 0) return `\`${text}\``;
        // The content contains backticks: delimit it with a longer backtick
        // run, padded with spaces, so the code span is not cut short.
        const longest = Math.max(...runs.map((run) => run.length));
        const fence = '`'.repeat(longest + 1);
        return `${fence} ${text} ${fence}`;
      }
      case 'img':
        return `\n![][${getPicture(e.src)}]\n`;
      default:
        return parseSingleLine(e);
    }
  }
  103.  
  /**
   * Render the inline content of a node as a single Markdown line.
   *
   * Text nodes contribute their text with runs of HTML whitespace (spaces,
   * tabs, line breaks) collapsed to one space, so source-code line wrapping
   * and indentation do not leak raw newlines into headings, table cells or
   * list items. Child elements are rendered through parseSimpleStyle();
   * other node kinds (comments, non-HTML elements, ...) are skipped.
   *
   * @param {Node} element - Text node or HTML element to render.
   * @returns {string} Trimmed Markdown text; '' for unsupported nodes.
   */
  function parseSingleLine(element) {
    const collapse = (text) => text.replace(/[ \t\r\n\f]+/g, ' ');

    if (element instanceof Text) return collapse(element.data).trim();
    if (!(element instanceof HTMLElement)) return '';

    let result = '';
    for (const child of element.childNodes) {
      if (child instanceof Text) {
        result += collapse(child.data);
      } else if (child instanceof HTMLElement) {
        result += parseSimpleStyle(child);
      }
    }
    return result.trim();
  }
  116.  
  /**
   * Tell whether a node can be rendered as one inline Markdown line.
   *
   * A node is multi-line as soon as it contains a paragraph, a list, a line
   * break, a code block, a block quote or a table anywhere below it; such
   * nodes must go through parseNode() so that block content (for example a
   * `<pre>` inside a list item) is not flattened into a single line.
   *
   * @param {Element} node - Element to inspect (must support querySelector).
   * @returns {boolean} True when no block-level descendant was found.
   */
  function isSingleLine(node) {
    return node.querySelector('p,ul,ol,br,pre,blockquote,table') === null;
  }
  120.  
  // Tags rendered inline; adjacent inline nodes and text are merged into one line.
  const INLINE_TAGS = new Set(['a', 'b', 'strong', 'i', 'em', 's', 'strike', 'code']);
  // Tags whose content never belongs to the article text.
  const IGNORED_TAGS = new Set(['button', 'style', 'header', 'script']);

  /** Render the `<li>` children of a list element as Markdown list lines. */
  function renderListLines(list, isOrdered) {
    const out = [];
    let index = 1;
    for (const item of list.childNodes) {
      if (!(item instanceof HTMLLIElement)) continue;
      const marker = isOrdered ? `${index}. ` : '- ';
      index++;

      if (isSingleLine(item)) {
        out.push(`${marker}${parseSingleLine(item)}`);
        continue;
      }

      // Continuation lines must be indented by the marker width to stay
      // inside the list item.
      const indent = ' '.repeat(marker.length);
      // The marker goes on the first non-blank line: items frequently start
      // with a whitespace-only text node, which yields a blank first line.
      let isFirst = true;
      for (const line of parseNode(item)) {
        if (!line.trim()) continue;
        // Drop leading newlines so the prefix sits on the same physical line
        // as the content; nested indentation is left untouched.
        out.push(`${isFirst ? marker : indent}${line.replace(/^\n+/, '')}`);
        isFirst = false;
      }
    }
    return out;
  }

  /** Prefix every physical line with `>` so multi-line entries stay quoted. */
  function renderQuoteLines(innerLines) {
    return innerLines
      .flatMap((line) => line.split('\n'))
      .map((line) => (line ? `> ${line}` : '>'));
  }

  /** Render a table element as Markdown table lines. */
  function renderTableLines(table) {
    const head = table.querySelector('thead');
    if (!head) {
      console.warn('unknown table!');
      // Without a header row a Markdown table cannot be built; render the
      // body (or the table's own children) as ordinary content instead.
      return parseNode(table.querySelector('tbody') ?? table);
    }

    // A table row must stay on one physical line and `|` separates cells.
    const cellText = (cell) => parseSingleLine(cell)
      .replace(/\s*\n\s*/g, ' ')
      .replace(/\|/g, '\\|');
    const rowLine = (cells) => `|${cells.map((text) => `${text}|`).join('')}`;

    const headCells = [...head.querySelectorAll('th')].map(cellText);
    const out = [rowLine(headCells), rowLine(headCells.map(() => '---'))];

    const body = table.querySelector('tbody');
    for (const row of body ? body.querySelectorAll('tr') : []) {
      out.push(rowLine([...row.querySelectorAll('th,td')].map(cellText)));
    }
    out.push('');
    return out;
  }

  /**
   * Convert the children of an element into Markdown output lines.
   *
   * Text nodes and inline elements that follow each other are merged into a
   * single line (whitespace between them is collapsed, not removed); every
   * block element flushes that pending line first and then contributes its
   * own lines. Entries may themselves contain `\n`; the caller joins all
   * entries with newlines.
   *
   * @param {Node} element - Root to convert; anything that is not an
   *   HTMLElement yields no lines.
   * @returns {string[]} Markdown lines in document order.
   */
  function parseNode(element) {
    const lines = [];
    if (!(element instanceof HTMLElement)) return lines;

    // Pending run of inline content; null means no run is open.
    let inlineRun = null;
    const flushInline = () => {
      if (inlineRun != null) {
        lines.push(inlineRun.trim());
        inlineRun = null;
      }
    };

    for (const child of element.childNodes) {
      if (child instanceof Text) {
        // Collapse instead of trimming so "foo <b>bar</b> baz" keeps its spaces.
        inlineRun = (inlineRun ?? '') + child.data.replace(/[ \t\r\n\f]+/g, ' ');
        continue;
      }
      if (!(child instanceof HTMLElement)) continue;

      const tagName = child.tagName.toLowerCase();
      if (INLINE_TAGS.has(tagName)) {
        inlineRun = (inlineRun ?? '') + parseSimpleStyle(child);
        continue;
      }

      flushInline();
      if (IGNORED_TAGS.has(tagName)) continue;

      switch (tagName) {
        case 'p':
          lines.push(`${parseSingleLine(child)}\n`);
          break;
        case 'br':
          lines.push('\n');
          break;
        case 'ul':
        case 'ol':
          lines.push('', ...renderListLines(child, tagName === 'ol'), '');
          break;
        case 'pre':
          lines.push(
            '```',
            ...(child.querySelector('code') || child).innerText.trim().split('\n'),
            '```',
          );
          break;
        case 'blockquote':
          lines.push('', ...renderQuoteLines(parseNode(child)), '');
          break;
        case 'table':
          lines.push('', ...renderTableLines(child));
          break;
        case 'hr':
          lines.push('\n---\n');
          break;
        case 'img':
          lines.push(`\n![][${getPicture(child.src)}]\n`);
          break;
        default: {
          // `figure.highlight` wrapping `td.code pre` is rendered as a fenced
          // block; the second class name is used as the language tag.
          if (tagName === 'figure' && child.classList.contains('highlight')) {
            const code = child.querySelector('td.code pre');
            if (code != null) {
              const lang = child.classList[1] || '';
              lines.push('```' + lang, ...code.innerText.trim().split('\n'), '```');
              break;
            }
          }
          // Skip line-number gutters of highlighted code tables.
          if ((tagName === 'figure' || tagName === 'td') && child.classList.contains('gutter')) {
            break;
          }
          const heading = tagName.match(/^h([1-6])$/);
          if (heading) {
            lines.push(`\n${'#'.repeat(Number(heading[1]))} ${parseSingleLine(child)} \n`);
          } else {
            // Generic container: recurse and splice its lines in place.
            lines.push(...parseNode(child));
          }
        }
      }
    }
    flushInline();
    return lines;
  }
  281.  
  /**
   * Locate the element that most likely holds the article content.
   *
   * Lookup order:
   *   1. the first `<article>` in the body;
   *   2. the first `div.markdown-body` or `div.mod-content`;
   *   3. among the parents of all `<h1>` elements, the one with the most
   *      child elements (the last one wins on ties);
   *   4. `document.body`, so the export is never empty just because the page
   *      has none of the above.
   *
   * @returns {Element|null} Article container; null only when the document
   *   has no body.
   */
  function findArticle() {
    const known = document.body.querySelector('article')
      ?? document.body.querySelector('div.markdown-body,div.mod-content');
    if (known) return known;

    let best = null;
    let bestCount = 0;
    for (const heading of document.querySelectorAll('h1')) {
      const parent = heading.parentNode;
      if (parent && parent.childElementCount >= bestCount) {
        best = parent;
        // Track the running maximum so a later, smaller parent cannot win.
        bestCount = parent.childElementCount;
      }
    }
    return best ?? document.body;
  }
  295.  
  /**
   * Produce the Markdown document for the current page.
   *
   * The output starts with a level-1 heading (the inline text of the first
   * `<h1>` in the document, or the document title when there is none) and
   * the page URL, followed by the converted article content.
   *
   * @returns {Promise<string>} The complete Markdown text.
   */
  async function html2MD() {
    const container = findArticle();
    const heading = document.querySelector('h1');
    const titleText = heading ? parseSingleLine(heading) : document.title;
    const header = `# ${titleText}\n${location.href}\n\n`;
    const content = await nodeToMD(container);
    return header + content;
  }
  309.  
  /**
   * Convert a DOM node to Markdown, followed by the picture reference
   * definitions (`[label]:target`, one per line).
   *
   * Pictures get a shared 10 second budget. A picture whose download fails
   * or does not finish in time keeps a reference to its original URL, so a
   * single slow or broken picture no longer aborts the whole export.
   *
   * @param {Node} node - Root node to convert.
   * @returns {Promise<string>} Markdown text.
   */
  async function nodeToMD(node) {
    const PICTURE_TIMEOUT_MS = 10000;
    const TIMED_OUT = Symbol('timed out');

    picMap.clear();
    // parseNode() registers every picture it meets in picMap as a side effect.
    let markdown = parseNode(node).map((line) => `${line}\n`).join('');

    let timer;
    const deadline = new Promise((resolve) => {
      if (picMap.size === 0) return; // nothing to wait for, no timer needed
      console.log('waiting 10s for downloading pictures...', picMap.size);
      timer = setTimeout(() => resolve(TIMED_OUT), PICTURE_TIMEOUT_MS);
    });

    let pics;
    try {
      pics = await Promise.all([...picMap].map(async ([url, pending]) => {
        try {
          const outcome = await Promise.race([pending, deadline]);
          if (outcome !== TIMED_OUT) return outcome;
          console.warn('picture download timed out, keeping remote URL:', url);
        } catch (err) {
          console.warn('picture download failed, keeping remote URL:', url, err);
        }
        return [getPictureKey(url), url];
      }));
    } finally {
      // Do not leave the timer pending once every picture is settled.
      clearTimeout(timer);
    }

    markdown += '\n';
    for (const [key, target] of pics) {
      markdown += `[${key}]:${target}\n`;
    }
    return markdown;
  }
  331.  
  // Expose the converter to the page context as `md(node)`; presumably meant
  // for manual use from the DevTools console.
  // NOTE(review): page scripts can call this too, and converting a node runs
  // getPicture(), which issues GM_xmlhttpRequest for every <img> it finds.
  // Confirm this exposure is intended.
  unsafeWindow.md = nodeToMD;
  // unsafeWindow.__xhr = GM_xmlhttpRequest;
  // unsafeWindow._getpic = getPicture
  335.  
  /**
   * Menu command handler: convert the current page and download the result
   * as `<document title>.md`.
   *
   * Failures are reported on the console instead of surfacing as an
   * unhandled promise rejection, and the temporary object URL is released
   * once the download has been handed to the browser.
   *
   * @returns {Promise<void>}
   */
  async function onClick() {
    // Delay before revoking the object URL; revoking it synchronously right
    // after click() can cancel the download in some browsers.
    const REVOKE_DELAY_MS = 10000;
    let url = null;
    try {
      const markdown = await html2MD();
      url = URL.createObjectURL(new Blob([markdown], { type: 'text/plain' }));
      const a = document.createElement('a');
      a.download = `${document.title || 'untitled'}.md`;
      a.href = url;
      // The link must be attached to the document for click() to trigger a
      // download in every browser; it is removed again right away.
      document.body.append(a);
      a.click();
      a.remove();
    } catch (err) {
      console.error('Markdown Grabber: export failed', err);
    } finally {
      if (url !== null) {
        const stale = url;
        setTimeout(() => URL.revokeObjectURL(stale), REVOKE_DELAY_MS);
      }
    }
  }

  GM_registerMenuCommand('下载 Markdown', () => {
    // onClick() handles its own errors, so the promise can be left unawaited.
    void onClick();
  });
  349.  
  350. })();