Virastar Library

Cleaning-up Persian Texts!

Tento skript by neměl být instalován přímo. Jedná se o knihovnu, kterou by měly jiné skripty využívat pomocí meta příkazu // @require https://update.greatest.deepsurf.us/scripts/527228/1538801/Virastar%20Library.js

  1. // ==UserScript==
  2. // @name Virastar Library
  3. // @version 0.21.0
  4. // @description Cleaning-up Persian Texts!
  5. // @homepage https://github.com/brothersincode/virastar/
  6. // @namespace amm1rr.com.virastar
  7. // @name:fa کتابخانه ویراستار
  8. // @description:fa ویراستار متنِ فارسی
  9. // @grant none
  10. // @updateURL https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
  11. // @downloadURL https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
  12. // @license MIT
  13. // ==/UserScript==
  14.  
  15. /*!
  16. * Virastar - v0.21.0 - 2020-05-14
  17. * https://github.com/brothersincode/virastar
  18. * Licensed: MIT
  19. */
  20.  
  21. (function (name, global, definition) {
  22. if (typeof module !== 'undefined') module.exports = definition();
  23. else if (typeof define === 'function' && typeof define.amd === 'object') define(definition);
  24. else if (typeof window !== 'undefined') window[name] = definition();
  25. else global[name] = definition();
  26. }('Virastar', this, function () {
  /**
   * Virastar entry point; works with or without `new`.
   *
   * - `Virastar(options)` stores the parsed options on the instance.
   * - `Virastar(text, options)` stores the parsed options and runs `cleanup`
   *   on the given text.
   *
   * @param {string|Object} [text] text to clean, or an options object
   * @param {Object} [options] options, used only when `text` is a string
   * @returns {Virastar} the instance
   */
  function Virastar (text, options) {
    // allows calling as a plain function
    if (!(this instanceof Virastar)) {
      return new Virastar(text, options);
    }

    var input = text || {};

    if (typeof input === 'string') {
      this.opts = parseOptions(options || {});

      // `cleanup` is invoked as a plain function, so it cannot read the
      // options from `this`; they must be handed over explicitly.
      // NOTE: a constructor invoked with `new` ignores a primitive return
      // value, so callers still receive the instance here.
      return cleanup(input, this.opts);
    }

    if (typeof input === 'object') {
      this.opts = parseOptions(input);
    }

    return this;
  }
  43.  
  /**
   * Builds a full option set: a shallow copy of `defaults` in which every
   * key that the caller supplied as an own property is overridden.
   * Keys that do not exist in `defaults` are ignored.
   *
   * @param {Object} [options] user supplied options
   * @returns {Object} a new object; `defaults` is never mutated
   */
  function parseOptions (options) {
    // @ref: https://scotch.io/bar-talk/copying-objects-in-javascript
    var parsed = Object.assign({}, defaults);

    if (options === null || options === undefined) {
      return parsed;
    }

    var known = Object.keys(parsed);

    for (var i = 0; i < known.length; i++) {
      // borrowed from Object.prototype so prototype-less objects work too
      if (Object.prototype.hasOwnProperty.call(options, known[i])) {
        parsed[known[i]] = options[known[i]];
      }
    }

    return parsed;
  }
  56.  
  /**
   * Replaces every occurrence of the n-th character of `fromBatch` with the
   * n-th character of `toBatch`, one pair after another.
   *
   * Characters are matched literally (regex metacharacters such as `.` or
   * `(` are safe), and source characters that have no counterpart in
   * `toBatch` are left untouched.
   *
   * @param {string} text
   * @param {string} fromBatch characters to look for
   * @param {string} toBatch replacement characters, position by position
   * @returns {string}
   */
  function charReplace (text, fromBatch, toBatch) {
    var fromChars = Array.from(fromBatch);
    var toChars = Array.from(toBatch);
    var pairs = Math.min(fromChars.length, toChars.length);

    for (var i = 0; i < pairs; i++) {
      // split/join is a literal, global replace
      text = text.split(fromChars[i]).join(toChars[i]);
    }

    return text;
  }
  65.  
  /**
   * Replaces characters by a map: for every own key of `array`, each
   * character listed in the value (used as a regex character class) is
   * replaced with the key itself.
   *
   * @param {string} text
   * @param {Object} array map of `replacement` => `characters to replace`
   * @returns {string}
   */
  function arrReplace (text, array) {
    Object.keys(array).forEach(function (replacement) {
      var pattern = newRegExp('[' + array[replacement] + ']');

      // a replacer function keeps `$` sequences in the key literal
      text = text.replace(pattern, function () {
        return replacement;
      });
    });

    return text;
  }
  74.  
  /**
   * Thin RegExp factory that defaults to the global (`g`) flag.
   *
   * @param {string} pattern regex source
   * @param {string} [flags] regex flags; any falsy value means `g`
   * @returns {RegExp}
   */
  function newRegExp (pattern, flags) {
    var effectiveFlags = flags ? flags : 'g';
    return new RegExp(pattern, effectiveFlags);
  }
  78.  
  // letters treated as persian word characters; the list also contains the
  // arabic forms `ة`, `ي` and `ك`
  var charsPersian = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك';

  // combining diacritic marks, used as a regex character class
  // @REF: https://en.wikipedia.org/wiki/Persian_alphabet#Diacritics
  // `\u064e\u0650\u064f\u064b\u064d\u064c\u0651\u06c0`
  var charsDiacritic = 'ًٌٍَُِّْ';
  84.  
  // regex source matching a URI (scheme, authority, path, query, fragment);
  // compiled with `newRegExp()` where it is used
  // @source: https://github.com/jhermsmeier/uri.regex
  var patternURI = "([A-Za-z][A-Za-z0-9+\\-.]*):(?:(//)(?:((?:[A-Za-z0-9\\-._~!$&'()*+,;=:]|%[0-9A-Fa-f]{2})*)@)?((?:\\[(?:(?:(?:(?:[0-9A-Fa-f]{1,4}:){6}|::(?:[0-9A-Fa-f]{1,4}:){5}|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,1}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}|(?:(?:[0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}|(?:(?:[0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:|(?:(?:[0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})?::)(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})?::)|[Vv][0-9A-Fa-f]+\\.[A-Za-z0-9\\-._~!$&'()*+,;=:]+)\\]|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:[A-Za-z0-9\\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})*))(?::([0-9]*))?((?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|/((?:(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)?)|((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|)(?:\\?((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?(?:\\#((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?";

  // body of a regex character class: whitespace, punctuation, quotes and
  // brackets that may directly follow a suffix
  var patternAfter = '\\s.,;،؛!؟?"\'()[\\]{}“”«»';
  88.  
  // Default value of every supported option. `parseOptions()` copies this
  // object and only accepts overrides for the keys listed here.
  var defaults = {
    // aggressive: true, // DEPRECATED

    // general clean-ups
    cleanup_begin_and_end: true,
    cleanup_extra_marks: true,
    cleanup_kashidas: true,
    cleanup_line_breaks: true,
    cleanup_rlm: true,
    cleanup_spacing: true,
    cleanup_zwnj: true,
    decode_htmlentities: true,

    // character and spacing fixes
    fix_arabic_numbers: true,
    fix_dashes: true,
    fix_diacritics: true,
    fix_english_numbers: true,
    fix_english_quotes_pairs: true,
    fix_english_quotes: true,
    fix_hamzeh: true,
    fix_hamzeh_arabic: false,
    fix_misc_non_persian_chars: true,
    fix_misc_spacing: true,
    fix_numeral_symbols: true,
    fix_perfix_spacing: true,
    fix_persian_glyphs: true,
    fix_punctuations: true,
    fix_question_mark: true,
    fix_spacing_for_braces_and_quotes: true,
    fix_spacing_for_punctuations: true,
    fix_suffix_misc: true,
    fix_suffix_spacing: true,
    fix_three_dots: true,
    kashidas_as_parenthetic: true,

    // markdown and normalization
    markdown_normalize_braces: true,
    markdown_normalize_lists: true,
    normalize_dates: true,
    normalize_ellipsis: true,
    normalize_eol: true,

    // parts of the text that are kept away from the filters
    preserve_braces: false,
    preserve_brackets: false,
    preserve_comments: true,
    preserve_entities: true,
    preserve_frontmatter: true,
    preserve_HTML: true,
    preserve_nbsps: true,
    preserve_URIs: true,

    remove_diacritics: false,
    skip_markdown_ordered_lists_numbers_conversion: false
  };
  136.  
  // persian digits in the order 1..9, 0; matched by position against the
  // source batches passed to `charReplace()`
  var digits = '۱۲۳۴۵۶۷۸۹۰';
  138.  
  /* eslint-disable */
  // named html entities decoded by `decodeHTMLEntities()`, keyed by the
  // entity name without the leading `&`
  var entities = {
    // quotation marks
    'sbquo;': '\u201a',
    'lsquo;': '\u2018',
    'lsquor;': '\u201a',
    'ldquo;': '\u201c',
    'ldquor;': '\u201e',
    'rdquo;': '\u201d',
    'rdquor;': '\u201d',
    'rsquo;': '\u2019',
    'rsquor;': '\u2019',
    'apos;': '\'',
    'QUOT;': '"',
    'QUOT': '"',
    'quot;': '"',
    'quot': '"',

    // joiners
    'zwj;': '\u200d',
    'ZWNJ;': '\u200c',
    'zwnj;': '\u200c',
    'shy;': '\u00ad' // wrongly used as zwnj
  };
  160.  
  // Map used by `fixPersianGlyphs()`: every character listed in a value is
  // replaced with its key. Entries are applied in the order written here.
  // props @ebraminio/persiantools
  var glyphs = {
    // these two are for visually available ZWNJ #visualZwnj
    '\u200cه': 'ﻫ',
    'ی\u200c': 'ﻰﻲ',

    // alef variants
    'ﺃ': 'ﺄﺃ',
    'ﺁ': 'ﺁﺂ',
    'ﺇ': 'ﺇﺈ',
    'ا': 'ﺎا',

    // letters
    'ب': 'ﺏﺐﺑﺒ',
    'پ': 'ﭖﭗﭘﭙ',
    'ت': 'ﺕﺖﺗﺘ',
    'ث': 'ﺙﺚﺛﺜ',
    'ج': 'ﺝﺞﺟﺠ',
    'چ': 'ﭺﭻﭼﭽ',
    'ح': 'ﺡﺢﺣﺤ',
    'خ': 'ﺥﺦﺧﺨ',
    'د': 'ﺩﺪ',
    'ذ': 'ﺫﺬ',
    'ر': 'ﺭﺮ',
    'ز': 'ﺯﺰ',
    'ژ': 'ﮊﮋ',
    'س': 'ﺱﺲﺳﺴ',
    'ش': 'ﺵﺶﺷﺸ',
    'ص': 'ﺹﺺﺻﺼ',
    'ض': 'ﺽﺾﺿﻀ',
    'ط': 'ﻁﻂﻃﻄ',
    'ظ': 'ﻅﻆﻇﻈ',
    'ع': 'ﻉﻊﻋﻌ',
    'غ': 'ﻍﻎﻏﻐ',
    'ف': 'ﻑﻒﻓﻔ',
    'ق': 'ﻕﻖﻗﻘ',
    'ک': 'ﮎﮏﮐﮑﻙﻚﻛﻜ',
    'گ': 'ﮒﮓﮔﮕ',
    'ل': 'ﻝﻞﻟﻠ',
    'م': 'ﻡﻢﻣﻤ',
    'ن': 'ﻥﻦﻧﻨ',
    'ه': 'ﻩﻪﻫﻬ',
    'هٔ': 'ﮤﮥ',
    'و': 'ﻭﻮ',
    'ﺅ': 'ﺅﺆ',
    'ی': 'ﯼﯽﯾﯿﻯﻰﻱﻲﻳﻴ',
    'ئ': 'ﺉﺊﺋﺌ',

    // lam-alef ligatures
    'لا': 'ﻼ',
    'ﻹ': 'ﻺ',
    'ﻷ': 'ﻸ',
    'ﻵ': 'ﻶ'
  };
  /* eslint-enable */
  210.  
  /**
   * Runs the whole clean-up pipeline on a text.
   *
   * Order of work: protected parts (frontmatter, html tags and comments,
   * brackets, braces, URIs, nbsps, entities) are swapped with placeholders,
   * the enabled filters run, and the protected parts are put back in the
   * reverse order.
   *
   * @param {string} text the text to clean
   * @param {Object} [options] options for this call; when omitted, the
   *   options stored on `this.opts` are used, and when those are missing too
   *   (plain function call) the defaults apply
   * @returns {string} the cleaned text; empty or whitespace-only input is
   *   returned untouched
   * @throws {TypeError} when `text` is not a string
   */
  function cleanup (text, options) {
    if (typeof text !== 'string') {
      throw new TypeError('Expected a String, but received ' + typeof text);
    }

    // dont bother if its empty or whitespace
    if (!text.trim()) {
      return text;
    }

    var opts;
    if (options) {
      opts = parseOptions(options);
    } else if (this && this.opts) {
      opts = this.opts;
    } else {
      // called as a plain function without options: `this` carries nothing
      opts = parseOptions({});
    }

    // swaps every match with a space-padded placeholder and returns the
    // matched strings in order of appearance
    function stash (pattern, placeholder) {
      var kept = [];
      text = text.replace(pattern, function (matched) {
        kept.push(matched);
        return ' ' + placeholder + ' ';
      });
      return kept;
    }

    // puts stashed strings back, swallowing one padding space on each side
    function unstash (placeholder, kept) {
      text = text.replace(newRegExp('[ ]?' + placeholder + '[ ]?'), function () {
        return kept.shift();
      });
    }

    // runs `value` through each `[optionName, filter]` whose option is on
    function applyEnabled (value, steps) {
      for (var i = 0; i < steps.length; i++) {
        if (opts[steps[i][0]]) {
          value = steps[i][1](value);
        }
      }
      return value;
    }

    var frontmatter, html, comments, brackets, braces;
    var mdlinks, uris, nbsps, preservedEntities;

    // single space paddings around the string
    text = ' ' + text + ' ';

    // preserves frontmatter data in the text
    if (opts.preserve_frontmatter) {
      frontmatter = stash(/^ ---[\S\s]*?---\n/g, '__FRONTMATTER__PRESERVER__');
    }

    // preserves all html tags in the text
    // @props: @wordpress/wordcount
    if (opts.preserve_HTML) {
      html = stash(/<\/?[a-z][^>]*?>/gi, '__HTML__PRESERVER__');
    }

    // preserves all html comments in the text
    // @props: @wordpress/wordcount
    if (opts.preserve_comments) {
      comments = stash(/<!--[\s\S]*?-->/g, '__COMMENT__PRESERVER__');
    }

    // preserves strings inside square brackets (`[]`)
    if (opts.preserve_brackets) {
      brackets = stash(/(\[.*?\])/g, '__BRACKETS__PRESERVER__');
    }

    // preserves strings inside curly braces (`{}`)
    if (opts.preserve_braces) {
      braces = stash(/(\{.*?\})/g, '__BRACES__PRESERVER__');
    }

    // preserves all uri strings in the text
    if (opts.preserve_URIs) {
      mdlinks = [];

      // stores markdown links separately
      text = text.replace(/]\((.*?)\)/g, function (matched, link) {
        if (link) {
          mdlinks.push(link.trim());
          return '](__MD_LINK__PRESERVER__)'; // no padding!
        }
        return matched;
      });

      uris = stash(newRegExp(patternURI), '__URI__PRESERVER__');
    }

    // preserves all no-break space entities in the text
    if (opts.preserve_nbsps) {
      nbsps = stash(/&nbsp;|&#160;/gi, '__NBSPS__PRESERVER__');
    }

    if (opts.decode_htmlentities) {
      text = decodeHTMLEntities(text);
    }

    // preserves all (remaining) html entities in the text
    // @props: @substack/node-ent
    if (opts.preserve_entities) {
      preservedEntities = stash(/&(#?[^;\W]+;?)/g, '__ENTITIES__PRESERVER__');
    }

    text = applyEnabled(text, [
      ['normalize_eol', normalizeEOL],
      ['fix_persian_glyphs', fixPersianGlyphs],
      ['fix_dashes', fixDashes],
      ['fix_three_dots', fixThreeDots],
      ['normalize_ellipsis', normalizeEllipsis],
      ['fix_english_quotes_pairs', fixEnglishQuotesPairs],
      ['fix_english_quotes', fixEnglishQuotes]
    ]);

    // hamzeh handling: either the hamzeh form or, as a fallback, the
    // zwnj + ye form used by the suffix spacing
    if (opts.fix_hamzeh) {
      if (opts.fix_hamzeh_arabic) {
        text = fixHamzehArabic(text);
      }

      text = fixHamzeh(text);
    } else if (opts.fix_suffix_spacing) {
      if (opts.fix_hamzeh_arabic) {
        text = fixHamzehArabicAlt(text);
      }

      text = fixSuffixSpacingHamzeh(text);
    }

    text = applyEnabled(text, [
      ['cleanup_rlm', cleanupRLM],
      ['cleanup_zwnj', cleanupZWNJ],
      ['fix_arabic_numbers', fixArabicNumbers]
    ]);

    // word tokenizer
    text = text.replace(/(^|\s+)([[({"'“«]?)(\S+)([\])}"'”»]?)(?=($|\s+))/g,
      function (matched, before, leadings, word, trailings, after) {
        // should not replace to persian chars in english phrases
        if (word.match(/[a-zA-Z\-_]{2,}/g)) {
          return matched;
        }

        // should not touch sprintf directives
        // @source: https://stackoverflow.com/a/8915445/
        if (word.match(/%(?:\d+\$)?[+-]?(?:[ 0]|'.{1})?-?\d*(?:\.\d+)?[bcdeEufFgGosxX]/g)) {
          return matched;
        }

        // should not touch numbers in html entities
        if (word.match(/&#\d+;/g)) {
          return matched;
        }

        // skips converting english numbers of ordered lists in markdown
        if (opts.skip_markdown_ordered_lists_numbers_conversion && (matched + trailings + after).match(/(?:(?:\r?\n)|(?:\r\n?)|(?:^|\n))\d+\.\s/)) {
          return matched;
        }

        return applyEnabled(matched, [
          ['fix_english_numbers', fixEnglishNumbers],
          ['fix_numeral_symbols', fixNumeralSymbols],
          ['fix_punctuations', fixPunctuations],
          ['fix_misc_non_persian_chars', fixMiscNonPersianChars],
          ['fix_question_mark', fixQuestionMark]
        ]);
      }
    );

    text = applyEnabled(text, [
      ['normalize_dates', normalizeDates],
      ['fix_perfix_spacing', fixPerfixSpacing],
      ['fix_suffix_spacing', fixSuffixSpacing],
      ['fix_suffix_misc', fixSuffixMisc],
      ['fix_spacing_for_braces_and_quotes', fixBracesSpacing],
      ['cleanup_extra_marks', cleanupExtraMarks],
      ['fix_spacing_for_punctuations', fixPunctuationSpacing],
      ['kashidas_as_parenthetic', kashidasAsParenthetic],
      ['cleanup_kashidas', cleanupKashidas],
      ['markdown_normalize_braces', markdownNormalizeBraces],
      ['markdown_normalize_lists', markdownNormalizeLists],
      // doing it again after `fixPunctuationSpacing()`
      ['fix_spacing_for_braces_and_quotes', fixBracesSpacingInside],
      ['fix_misc_spacing', fixMiscSpacing]
    ]);

    if (opts.remove_diacritics) {
      text = removeDiacritics(text);
    } else if (opts.fix_diacritics) {
      text = fixDiacritics(text);
    }

    text = applyEnabled(text, [
      ['cleanup_spacing', cleanupSpacing],
      ['cleanup_zwnj', cleanupZWNJLate],
      ['cleanup_line_breaks', cleanupLineBreaks]
    ]);

    // bringing back entities
    if (opts.preserve_entities) {
      unstash('__ENTITIES__PRESERVER__', preservedEntities);
    }

    // bringing back nbsps
    if (opts.preserve_nbsps) {
      unstash('__NBSPS__PRESERVER__', nbsps);
    }

    // bringing back URIs
    if (opts.preserve_URIs) {
      // no padding!
      text = text.replace(/__MD_LINK__PRESERVER__/g, function () {
        return mdlinks.shift();
      });

      unstash('__URI__PRESERVER__', uris);
    }

    // bringing back braces
    if (opts.preserve_braces) {
      unstash('__BRACES__PRESERVER__', braces);
    }

    // bringing back brackets
    if (opts.preserve_brackets) {
      unstash('__BRACKETS__PRESERVER__', brackets);
    }

    // bringing back HTML comments
    if (opts.preserve_comments) {
      unstash('__COMMENT__PRESERVER__', comments);
    }

    // bringing back HTML tags
    if (opts.preserve_HTML) {
      unstash('__HTML__PRESERVER__', html);
    }

    // bringing back frontmatter
    if (opts.preserve_frontmatter) {
      unstash('__FRONTMATTER__PRESERVER__', frontmatter);
    }

    if (opts.cleanup_begin_and_end) {
      text = cleanupBeginAndEnd(text);
    } else {
      // removes single space paddings around the string
      text = text.replace(/^[ ]/g, '').replace(/[ ]$/g, '');
    }

    return text;
  }
  559.  
  /**
   * First pass of zero-width non-joiner (ZWNJ, U+200C) clean-up.
   * props @ebraminio/persiantools
   *
   * @param {string} text
   * @returns {string}
   */
  function cleanupZWNJ (text) {
    // converts all soft hyphens (&shy;) into zwnj
    var cleaned = text.replace(/\u00ad/g, '\u200c');

    // collapses runs of zwnj into a single one
    cleaned = cleaned.replace(/\u200c{2,}/g, '\u200c');

    // drops zwnj before numbers, english words, spaces and punctuations ...
    cleaned = cleaned.replace(/\u200c([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])/g, '$1');

    // ... and after them
    cleaned = cleaned.replace(/([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])\u200c/g, '$1');

    // removes unnecessary zwnj on start/end of each line
    return cleaned.replace(/(^\u200c|\u200c$)/gm, '');
  }
  578.  
  /**
   * Late zwnj check: removes a zwnj that follows a character which does not
   * connect to the next one (or one of the listed symbols).
   *
   * @param {string} text
   * @returns {string}
   */
  function cleanupZWNJLate (text) {
    var nonJoinerThenZwnj = /([إأةؤورزژاآدذ،؛,:«»\\/@#$٪×*()ـ\-=|])\u200c/g;
    return text.replace(nonJoinerThenZwnj, '$1');
  }
  587.  
  /**
   * Converts numeric character references (decimal and hexadecimal) and the
   * named entities listed in `entities` into the characters they stand for.
   * Anything that cannot be decoded is left exactly as it was.
   * @props: @substack/node-ent
   *
   * @param {string} text
   * @returns {string}
   */
  function decodeHTMLEntities (text) {
    return text.replace(/&(#?[^;\W]+;?)/g, function (matched, match) {
      var radix = 10;
      var numeric = /^#(\d+);?$/.exec(match);

      if (!numeric) {
        radix = 16;
        numeric = /^#[Xx]([A-Fa-f0-9]+);?$/.exec(match);
      }

      if (numeric) {
        var codePoint = parseInt(numeric[1], radix);

        // `fromCodePoint` also handles characters beyond U+FFFF, but throws
        // on values that are not valid code points
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : matched;
      }

      var hasSemi = /;$/.test(match);
      var withoutSemi = hasSemi ? match.replace(/;$/, '') : match;
      var target = entities[withoutSemi] || (hasSemi && entities[match]);

      if (typeof target === 'number') {
        return String.fromCodePoint(target);
      }

      if (typeof target === 'string') {
        return target;
      }

      // unknown entity
      return matched;
    });
  }
  612.  
  /**
   * Converts every line ending (`\r\n`, `\r` or `\n`) to a unix one (`\n`).
   *
   * @param {string} text
   * @returns {string}
   */
  function normalizeEOL (text) {
    var anyLineEnding = /(\r?\n)|(\r\n?)/g;
    return text.replace(anyLineEnding, '\n');
  }
  620.  
  /**
   * Converts runs of hyphens: three become an em dash, two an en dash.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixDashes (text) {
    // triple dashes first, otherwise the double-dash rule would eat them
    var withEmDashes = text.replace(/-{3}/g, '—');

    return withEmDashes.replace(/-{2}/g, '–');
  }
  631.  
  /**
   * Turns three or more dots (also when separated by spaces) into a single
   * ellipsis character, removing spaces and tabs right before them.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixThreeDots (text) {
    // removes spaces between dots
    var joinedDots = text.replace(/\.([ ]+)(?=[.])/g, '.');

    // replaces three dots with ellipsis character
    return joinedDots.replace(/[ \t]*\.{3,}/g, '…');
  }
  642.  
  /**
   * Collapses repeated ellipsis characters and puts exactly one space after
   * each ellipsis.
   *
   * @param {string} text
   * @returns {string}
   */
  function normalizeEllipsis (text) {
    // replaces more than one ellipsis with one
    var single = text.replace(/(…){2,}/g, '…');

    // replaces (space|tab|zwnj) after ellipsis with one space
    // NOTE: allows for space before ellipsis
    return single.replace(/([ ]{1,})*…[ \t\u200c]*/g, '$1… ');
  }
  654.  
  /**
   * Replaces english curly quote pairs (`“…”`) with persian guillemets.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixEnglishQuotesPairs (text) {
    var curlyPair = /(“)(.+?)(”)/g;
    return text.replace(curlyPair, '«$2»');
  }
  662.  
  /**
   * Replaces text wrapped in matching english quote marks (`"`, `'` or a
   * backtick, possibly repeated) with persian guillemets.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixEnglishQuotes (text) {
    var quotedRun = /(["'`]+)(.+?)(\1)/g;
    return text.replace(quotedRun, '«$2»');
  }
  669.  
  /**
   * Normalizes the different spellings of "heh + hamzeh" to `هٔ`.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixHamzeh (text) {
    var hamzeh = '$1هٔ$3';

    // replaces ه followed by (space|ZWNJ|lrm) follow by ی with هٔ
    var result = text.replace(/(\S)(ه[\s\u200c\u200e]+[یي])([\s\u200c\u200e])/g, hamzeh); // heh + ye

    // replaces ه followed by (space|ZWNJ|lrm|nothing) follow by ء with هٔ
    result = result.replace(/(\S)(ه[\s\u200c\u200e]?\u0621)([\s\u200c\u200e])/g, hamzeh); // heh + standalone hamza

    // replaces هٓ or single-character ۀ with the standard هٔ
    // props @ebraminio/persiantools
    return result.replace(/(ۀ|هٓ)/g, 'هٔ');
  }
  685.  
  /**
   * Converts a word-final arabic `ة` (followed by whitespace, zwnj or lrm)
   * to `هٔ`.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixHamzehArabic (text) {
    var finalTehMarbuta = /(\S)ة([\s\u200c\u200e])/g;
    return text.replace(finalTehMarbuta, '$1هٔ$2');
  }
  693.  
  /**
   * Converts a word-final arabic `ة` (followed by whitespace, zwnj or lrm)
   * to heh + zwnj + ye.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixHamzehArabicAlt (text) {
    var finalTehMarbuta = /(\S)ة([\s\u200c\u200e])/g;

    // the zwnj is written as an escape so it stays visible in the source
    return text.replace(finalTehMarbuta, '$1ه\u200cی$2');
  }
  700.  
  /**
   * Converts a right-to-left mark (U+200F) to a zwnj when the character
   * before it is not a latin letter, a hyphen or an underscore.
   *
   * @param {string} text
   * @returns {string}
   */
  function cleanupRLM (text) {
    var rlmAfterNonLatin = /([^a-zA-Z\-_])(\u200F)/g;
    return text.replace(rlmAfterNonLatin, '$1\u200c');
  }
  708.  
  /**
   * Converts incorrect persian glyphs to standard characters, using the
   * `glyphs` map.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixPersianGlyphs (text) {
    var normalized = arrReplace(text, glyphs);
    return normalized;
  }
  713.  
  /**
   * Replaces look-alike arabic, urdu, pashto, uyghur and kurdish letters
   * with their persian equivalents.
   * props @ebraminio/persiantools
   *
   * @param {string} text
   * @returns {string}
   */
  function fixMiscNonPersianChars (text) {
    // written as escapes because several of these letters are visually
    // indistinguishable from each other
    var pairs = [
      ['\u0643', '\u06a9'], // ك arabic kaf -> ک
      ['\u06aa', '\u06a9'], // ڪ arabic letter swash kaf -> ک
      ['\u064a', '\u06cc'], // ي arabic -> ی
      ['\u0649', '\u06cc'], // ى urdu -> ی
      ['\u06cd', '\u06cc'], // ۍ pushtu -> ی
      ['\u06d0', '\u06cc'], // ې uyghur -> ی
      ['\u06c1', '\u0647'], // ہ (&#x06C1;) -> ه (&#x0647;)
      ['\u06d5', '\u0647'], // ە kurdish -> ه
      ['\u06be', '\u0647'] // ھ kurdish -> ه
    ];

    var fromBatch = '';
    var toBatch = '';

    for (var i = 0; i < pairs.length; i++) {
      fromBatch += pairs[i][0];
      toBatch += pairs[i][1];
    }

    return charReplace(text, fromBatch, toBatch);
  }
  727.  
  /**
   * Replaces english digits with their persian equivalents.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixEnglishNumbers (text) {
    var englishDigits = '1234567890';
    return charReplace(text, englishDigits, digits);
  }
  732.  
  /**
   * Replaces arabic digits with their persian equivalents.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixArabicNumbers (text) {
    var arabicDigits = '١٢٣٤٥٦٧٨٩٠';
    return charReplace(text, arabicDigits, digits);
  }
  737.  
  /**
   * Converts arabic (U+0660-U+0669) and persian (U+06F0-U+06F9) digits to
   * ascii digits.
   * @REF: https://github.com/shkarimpour/pholiday/pull/5/files
   *
   * @param {string} text
   * @returns {string}
   */
  function convertPersianNumbers (text) {
    var localDigit = /[\u0660-\u0669\u06f0-\u06f9]/g;

    return text.replace(localDigit, function (digit) {
      // the low four bits of both ranges hold the digit value
      var value = digit.charCodeAt(0) & 0xf;
      return String(value);
    });
  }
  744.  
  /**
   * Replaces the symbols used with persian digits: percent sign, decimal
   * separator and thousands separator.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixNumeralSymbols (text) {
    // replaces english percent signs (U+066A)
    // props @ebraminio/persiantools
    var result = text.replace(/([۰-۹]) ?%/g, '$1٪');

    // replaces dots between numbers into decimal separator (U+066B)
    // props @ebraminio/persiantools
    result = result.replace(/([۰-۹])\.(?=[۰-۹])/g, '$1٫');

    // replaces commas between numbers into thousands separator (U+066C)
    // props @languagetool-org
    result = result.replace(/([۰-۹]),(?=[۰-۹])/g, '$1٬');

    return result;
  }
  761.  
  /**
   * Re-orders dates written as `day/month/year` or `day-month-year` (four
   * digit year) into `year/month/day`, always using a slash as delimiter.
   *
   * @param {string} text
   * @returns {string}
   */
  function normalizeDates (text) {
    // groups: 1 = day, 2 = delimiter, 3 = month, 4 = year
    var dayFirstDate = /([0-9۰-۹]{1,2})([/-])([0-9۰-۹]{1,2})\2([0-9۰-۹]{4})/g;

    return text.replace(dayFirstDate, '$4/$3/$1');
  }
  771.  
  /**
   * Replaces the english comma and semicolon with their persian forms.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixPunctuations (text) {
    var english = ',;';
    var persian = '،؛';
    return charReplace(text, english, persian);
  }
  775.  
  /**
   * Replaces question marks with the persian question mark (U+061F).
   *
   * @param {string} text
   * @returns {string}
   */
  function fixQuestionMark (text) {
    var persianQuestionMark = '\u061F'; // \u061F = ؟
    return text.split('?').join(persianQuestionMark);
  }
  782.  
  /**
   * Puts a zwnj (instead of a space) between a prefix and the word:
   * - mi* nemi* bi*
   * NOTE: there's a possible bug here: prefixes could be separate nouns
   *
   * @param {string} text
   * @returns {string}
   */
  function fixPerfixSpacing (text) {
    var prefixPatterns = [
      /((\s|^)ن?می) ([^ ])/g,
      /((\s|^)بی) ([^ ])/g // props @zoghal
    ];

    return prefixPatterns.reduce(function (result, pattern) {
      return result.replace(pattern, '$1\u200c$3');
    }, text);
  }
  793.  
  /**
   * Puts a zwnj (instead of a space) between a word and its suffix.
   * NOTE: possible bug: suffixes could be nouns
   *
   * @param {string} text
   * @returns {string}
   */
  function fixSuffixSpacing (text) {
    var suffixes = [
      // must done before others
      // *ha *haye
      'ها(ی)?',

      // *am *at *ash *ei *eid *eem *and *man *tan *shan
      '(ام|ات|اش|ای|اید|ایم|اند|مان|تان|شان)',

      // *tar *tari *tarin
      'تر((ی)|(ین))?',

      // *hayee *hayam *hayat *hayash *hayetan *hayeman *hayeshan
      '(هایی|هایم|هایت|هایش|هایمان|هایتان|هایشان)'
    ];

    // a persian letter or diacritic, then a single space
    var wordEnd = '([' + charsPersian + charsDiacritic + ']) ';

    for (var i = 0; i < suffixes.length; i++) {
      var pattern = wordEnd + '(' + suffixes[i] + '[' + patternAfter + '])';
      text = text.replace(newRegExp(pattern), '$1\u200c$2');
    }

    return text;
  }
  814.  
  /**
   * Rewrites the different "heh + ye/hamzeh" endings as heh + zwnj + ye.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixSuffixSpacingHamzeh (text) {
    var endings = [
      // heh + ye
      /(\S)(ه[\s\u200c]+[یي])([\s\u200c])/g,

      // heh + standalone hamza
      /(\S)(ه[\s\u200c]?\u0621)([\s\u200c])/g,

      // heh + hamza above
      /(\S)(ه[\s\u200c]?\u0654)([\s\u200c])/g
    ];

    return endings.reduce(function (result, pattern) {
      return result.replace(pattern, '$1\u0647\u200c\u06cc$3');
    }, text);
  }
  829.  
  /**
   * Replaces ه followed by (zwnj|lrm), then ئ or ی, and then ی, with
   * ه + zwnj + ای.
   * EXAMPLE: خانه‌ئی becomes خانه‌ای
   * props @ebraminio/persiantools
   *
   * @param {string} text
   * @returns {string}
   */
  function fixSuffixMisc (text) {
    var hehYeYe = /(\S)ه[\u200c\u200e][ئی]ی([\s\u200c\u200e])/g;
    return text.replace(hehYeYe, '$1ه\u200cای$2');
  }
  838.  
  /**
   * Cleans up repeated exclamation and question marks.
   *
   * @param {string} text
   * @returns {string}
   */
  function cleanupExtraMarks (text) {
    // removes space between different/same marks (combining for cleanup)
    var result = text.replace(/([؟?!])([ ]+)(?=[؟?!])/g, '$1');

    // replaces more than one exclamation mark with just one
    result = result.replace(/(!){2,}/g, '$1');

    // replaces more than one english or persian question mark with just one
    result = result.replace(/(\u061F|\?){2,}/g, '$1'); // \u061F = `؟`

    // re-orders consecutive marks: the question mark goes first
    result = result.replace(/(!)([ \t]*)([\u061F?])/g, '$3$1'); // `!?` --> `?!`

    return result;
  }
  853.  
  /**
   * Replaces kashidas (U+0640) that touch whitespace with an en dash, so
   * they can act as parenthetic dashes.
   *
   * @param {string} text
   * @returns {string}
   */
  function kashidasAsParenthetic (text) {
    // kashidas right after a whitespace
    var result = text.replace(/(\s)\u0640+/g, '$1–');

    // kashidas right before a whitespace
    return result.replace(/\u0640+(\s)/g, '–$1');
  }
  861.  
  /**
   * Removes kashidas (U+0640) inside words; between numbers they become an
   * en dash instead.
   *
   * @param {string} text
   * @returns {string}
   */
  function cleanupKashidas (text) {
    // converts kashida between numbers to ndash
    var result = text.replace(/([0-9۰-۹]+)ـ+([0-9۰-۹]+)/g, '$1–$2');

    // removes all kashidas between non-whitespace characters
    // MAYBE: more punctuations
    return result.replace(/([^\s.])\u0640+(?![\s.])/g, '$1');
  }
  872.  
  /**
   * Normalizes the spacing around punctuation marks: none before, one space
   * after, with exceptions for times, decimal numbers and domain names.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixPunctuationSpacing (text) {
    // removes space before punctuations
    var result = text.replace(/[ \t\u200c]*([:;,؛،.؟?!]{1})/g, '$1');

    // removes more than one space after punctuations
    // except followed by new-lines (or preservers)
    result = result.replace(/([:;,؛،.؟?!]{1})[ \t\u200c]*(?!\n|_{2})/g, '$1 ');

    // removes space after colon that separates time parts
    result = result.replace(/([0-9۰-۹]+):\s+([0-9۰-۹]+)/g, '$1:$2');

    // removes space after dots in numbers
    result = result.replace(/([0-9۰-۹]+)\. ([0-9۰-۹]+)/g, '$1.$2');

    // removes space before common domain tlds
    result = result.replace(/([\w\-_]+)\. (ir|com|org|net|info|edu|me)([\s/\\\])»:;.])/g, '$1.$2$3');

    // removes space between different/same marks (double-check)
    result = result.replace(/([؟?!])([ ]+)(?=[؟?!])/g, '$1');

    return result;
  }
  895.  
  /**
   * Removes the spaces inside `()`, `[]`, `{}`, `“”` and `«»` pairs and
   * leaves exactly one space on each outer side.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixBracesSpacing (text) {
    var pairs = [
      /[ \t\u200c]*(\()\s*([^)]+?)\s*?(\))[ \t\u200c]*/g,
      /[ \t\u200c]*(\[)\s*([^\]]+?)\s*?(\])[ \t\u200c]*/g,
      /[ \t\u200c]*(\{)\s*([^}]+?)\s*?(\})[ \t\u200c]*/g,
      /[ \t\u200c]*(“)\s*([^”]+?)\s*?(”)[ \t\u200c]*/g,
      /[ \t\u200c]*(«)\s*([^»]+?)\s*?(»)[ \t\u200c]*/g
    ];

    return pairs.reduce(function (result, pattern) {
      return result.replace(pattern, ' $1$2$3 ');
    }, text);
  }
  908.  
  /**
   * Removes the spaces inside `()`, `[]`, `{}`, `“”` and `«»` pairs without
   * touching the outside.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixBracesSpacingInside (text) {
    var pairs = [
      /(\()\s*([^)]+?)\s*?(\))/g,
      /(\[)\s*([^\]]+?)\s*?(\])/g,
      /(\{)\s*([^}]+?)\s*?(\})/g,
      /(“)\s*([^”]+?)\s*?(”)/g,
      /(«)\s*([^»]+?)\s*?(»)/g
    ];

    var result = pairs.reduce(function (current, pattern) {
      return current.replace(pattern, '$1$2$3');
    }, text);

    // NOTE: must be here, weird not working if on `markdownNormalizeBraces()`
    // removes markdown link spaces inside normal ()
    return result.replace(/(\(\[.*?\]\(.*?\))\s+(\))/g, '$1$2');
  }
  924.  
  /**
   * Tightens the spacing of markdown images, links and doubled braces.
   *
   * @param {string} text
   * @returns {string}
   */
  function markdownNormalizeBraces (text) {
    var rules = [
      // removes space between ! and opening brace on markdown images
      // EXAMPLE: `! [alt] (src)` --> `![alt](src)`
      [/! (\[.*?\])[ ]?(\(.*?\))[ ]?/g, '!$1$2'],

      // removes spaces between [] and ()
      // EXAMPLE: `[text] (link)` --> `[text](link)`
      [/(\[.*?\])[ \t]+(\(.*?\))/g, '$1$2'],

      // removes spaces inside double () [] {}
      // EXAMPLE: `[[ text ]]` --> `[[text]]`
      [/\(\([ \t]*(.*?)[ \t]*\)\)/g, '(($1))'],
      [/\[\[[ \t]*(.*?)[ \t]*\]\]/g, '[[$1]]'],
      [/\{\{[ \t]*(.*?)[ \t]*\}\}/g, '{{$1}}'],
      [/\{\{\{[ \t]*(.*?)[ \t]*\}\}\}/g, '{{{$1}}}'], // mustache escape

      // removes spaces between double () [] {}
      // EXAMPLE: `[[text] ]` --> `[[text]]`
      [/(\(\(.*\))[ \t]+(\))/g, '$1$2'],
      [/(\[\[.*\])[ \t]+(\])/g, '$1$2'],
      [/(\{\{.*\})[ \t]+(\})/g, '$1$2']
    ];

    for (var i = 0; i < rules.length; i++) {
      text = text.replace(rules[i][0], rules[i][1]);
    }

    return text;
  }
  949.  
  /**
   * Removes the extra blank lines between two consecutive lines that start
   * with the same markdown marker (`*`, `-` or `#`).
   *
   * @param {string} text
   * @returns {string}
   */
  function markdownNormalizeLists (text) {
    var gaps = [
      /((\n|^)\*.*?)\n+(?=\n\*)/g,
      /((\n|^)-.*?)\n+(?=\n-)/g,
      /((\n|^)#.*?)\n+(?=\n#)/g
    ];

    return gaps.reduce(function (result, pattern) {
      return result.replace(pattern, '$1');
    }, text);
  }
  958.  
  /**
   * Removes the space before a few special bracketed forms.
   *
   * @param {string} text
   * @returns {string}
   */
  function fixMiscSpacing (text) {
    // removes space before parentheses on misc cases
    var result = text.replace(/ \((ص|عج|س|ع|ره)\)/g, '($1)');

    // removes space before braces containing numbers
    return result.replace(/ \[([0-9۰-۹]+)\]/g, '[$1]');
  }
  969.  
  /**
   * Cleans up the characters around diacritic marks (`charsDiacritic`).
   *
   * @param {string} text
   * @returns {string}
   */
  function fixDiacritics (text) {
    var mark = '[' + charsDiacritic + ']';

    // cleans zwnj before diacritic characters
    var result = text.replace(newRegExp('\u200c(' + mark + ')'), '$1');

    // cleans more than one diacritic characters
    // props @languagetool-org
    result = result.replace(newRegExp('(.*)(' + mark + '){2,}(.*)'), '$1$2$3');

    // cleans spaces before diacritic characters
    result = result.replace(newRegExp('(\\S)[ ]+(' + mark + ')'), '$1$2');

    return result;
  }
  983.  
  /**
   * Strips every character listed in `charsDiacritic` from the text.
   *
   * Relies on `newRegExp` and `charsDiacritic`, both defined elsewhere in
   * this file.
   *
   * @param {string} text - the text to strip
   * @returns {string} the text without diacritic characters
   */
  function removeDiacritics (text) {
    // one or more diacritic characters in a row
    var diacriticRun = newRegExp('[' + charsDiacritic + ']+');

    return text.replace(diacriticRun, '');
  }
  991.  
  /**
   * Collapses runs of spaces and empties whitespace-only lines.
   *
   * @param {string} text - the text to clean
   * @returns {string} the cleaned text
   */
  function cleanupSpacing (text) {
    // two or more spaces following a character other than `_`, unless the
    // run is followed by `__` (a preserver) or by a new-line
    // (a lookbehind variant was noted as working as well:
    //  /(?<![_]{2})([ ]{2,})(?![_]{2}|\n)/g)
    var spaceRun = /([^_])([ ]{2,})(?![_]{2}|\n)/g;

    // tab/space/zwnj/zwj/nbsp between two new-lines(\n)
    // @REF: https://stackoverflow.com/a/10965543/
    var paddedBlankLine = /^\n([\t\u0020\u200c\u200d\u00a0]*)\n$/gm;

    var collapsed = text.replace(spaceRun, '$1 ');

    return collapsed.replace(paddedBlankLine, '\n\n');
  }
  1004.  
  /**
   * Limits consecutive line-breaks to at most two (one blank line).
   *
   * @param {string} text - the text to clean
   * @returns {string} the text with longer runs of `\n` reduced to `\n\n`
   */
  function cleanupLineBreaks (text) {
    // any run of two or more new-lines
    var lineBreakRun = /\n{2,}/g;

    return text.replace(lineBreakRun, '\n\n');
  }
  1012.  
  /**
   * Removes leading whitespace from each line and trims the whole text.
   *
   * @param {string} text - the text to clean
   * @returns {string} the trimmed text
   */
  function cleanupBeginAndEnd (text) {
    // space/tab/zwnj/nbsp right after one or more new-lines
    var lineIndent = /([\n]+)[ \t\u200c\u00a0]*/g;

    // whitespace, zwnj and direction marks (lrm/rlm) at either end of text
    // @REF: http://stackoverflow.com/a/38490203
    var outerPadding = /^[\s\u200c\u200e\u200f]+|[\s\u200c\u200e\u200f]+$/g;

    var unindented = text.replace(lineIndent, '$1');

    return unindented.replace(outerPadding, '');
  }
  1025.  
  /**
   * Moves punctuation that sits on the wrong end of the text to the other
   * end: each of `!`, `.`, `،`, `…`, `"` found at the very beginning is
   * appended to the end, and a `-` found at the very end is put in front,
   * followed by a space.
   *
   * Each mark is checked once, in the order listed. The text is passed
   * through `fixThreeDots` first and `normalizeEllipsis` last; those and
   * `newRegExp` are defined elsewhere in this file.
   *
   * @param {string} text - the text to fix
   * @returns {string} the text with the punctuation flipped
   */
  function flipPunctuations (text) {
    var leadingMarks = ['!', '.', '،', '…', '"'];
    var trailingMarks = ['-'];
    var toPrepend = [];
    var toAppend = [];
    var result = fixThreeDots(text);

    // strip marks from the beginning, remembering them for the end
    leadingMarks.forEach(function (mark) {
      var atBeginning = newRegExp('^\\' + mark, 'i');

      if (!atBeginning.test(result)) {
        return;
      }

      result = result.replace(atBeginning, '').trim();
      toAppend.push(mark);
    });

    // strip marks from the end, remembering them for the beginning
    trailingMarks.forEach(function (mark) {
      var atEnd = newRegExp('\\' + mark + '$', 'i');

      if (!atEnd.test(result)) {
        return;
      }

      result = result.replace(atEnd, '').trim();
      toPrepend.push(mark);
    });

    toPrepend.forEach(function (mark) {
      result = mark + ' ' + result;
    });

    result += toAppend.join('');

    return normalizeEllipsis(result);
  }
  1062.  
  /**
   * Repairs quote pairs written in reverse order:
   * `»…«` becomes `«…»` and `”…“` becomes `“…”`.
   *
   * @param {string} text - the text to fix
   * @returns {string} the text with reversed quote pairs swapped
   */
  function swapQuotes (text) {
    var reversedGuillemets = /(»)(.+?)(«)/g;
    var reversedCurlyQuotes = /(”)(.+?)(“)/g;

    var result = text.replace(reversedGuillemets, '«$2»');
    result = result.replace(reversedCurlyQuotes, '“$2”');

    return result;
  }
  1070.  
  // Members shared by every Virastar instance.
  Virastar.prototype = {

    // --- public API ---
    defaults: defaults,
    cleanup: cleanup,

    // --- internal helpers ---
    // These are commented out and therefore not exposed on the prototype;
    // the list is kept as a reference of the helper names.
    //
    //   cleanupZWNJ, cleanupZWNJLate, decodeHTMLEntities, normalizeEOL,
    //   fixDashes, fixThreeDots, normalizeEllipsis, fixEnglishQuotesPairs,
    //   fixEnglishQuotes, fixHamzeh, fixHamzehArabic, fixHamzehArabicAlt,
    //   cleanupRLM, fixPersianGlyphs, fixMiscNonPersianChars,
    //   fixEnglishNumbers, fixArabicNumbers, fixNumeralSymbols,
    //   fixPunctuations, fixQuestionMark, fixPerfixSpacing,
    //   fixSuffixSpacing, fixSuffixSpacingHamzeh, fixSuffixMisc,
    //   cleanupExtraMarks, kashidasAsParenthetic, cleanupKashidas,
    //   fixPunctuationSpacing, fixBracesSpacing, fixBracesSpacingInside,
    //   markdownNormalizeBraces, markdownNormalizeLists, fixDiacritics,
    //   cleanupSpacing, cleanupLineBreaks, cleanupBeginAndEnd

    // --- extra utilities ---
    convertPersianNumbers: convertPersianNumbers,
    flipPunctuations: flipPunctuations,
    swapQuotes: swapQuotes
  };
  1120.  
  1121. return Virastar;
  1122. }));