Tento skript by neměl být instalován přímo. Jedná se o knihovnu, kterou by měly jiné skripty využívat pomocí meta příkazu // @require https://update.greatest.deepsurf.us/scripts/527228/1538801/Virastar%20Library.js
- // ==UserScript==
- // @name Virastar Library
- // @version 0.21.0
- // @description Cleaning-up Persian Texts!
- // @homepage https://github.com/brothersincode/virastar/
- // @namespace amm1rr.com.virastar
- // @name:fa کتابخانه ویراستار
- // @description:fa ویراستار متنِ فارسی
- // @grant none
- // @updateURL https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
- // @downloadURL https://raw.githubusercontent.com/brothersincode/virastar/master/lib/virastar.js
- // @license MIT
- // ==/UserScript==
-
- /*!
- * Virastar - v0.21.0 - 2020-05-14
- * https://github.com/brothersincode/virastar
- * Licensed: MIT
- */
-
- (function (name, global, definition) {
- if (typeof module !== 'undefined') module.exports = definition();
- else if (typeof define === 'function' && typeof define.amd === 'object') define(definition);
- else if (typeof window !== 'undefined') window[name] = definition();
- else global[name] = definition();
- }('Virastar', this, function () {
/**
 * Virastar entry point; usable with or without `new`.
 *
 * - `Virastar(options)` / `new Virastar(options)` build an instance whose
 *   `opts` are the defaults overridden by `options`.
 * - `Virastar(text, options)` (plain call with a non-empty string) cleans
 *   `text` right away and returns the cleaned string.
 * - `new Virastar(text, options)` cannot return a string (a primitive returned
 *   from a constructor is discarded), so it only stores `options` and returns
 *   the instance; call `.cleanup(text)` on it.
 *
 * @param {(Object|string)} [text] - options object, or the text to clean
 * @param {Object} [options] - options, used when `text` is a string
 * @returns {(Virastar|string)}
 */
function Virastar (text, options) {
  if (!(this instanceof Virastar)) {
    if (typeof text === 'string' && text) {
      // run cleanup through a real instance so that it sees the parsed options
      return new Virastar(options || {}).cleanup(text);
    }
    return new Virastar(text, options);
  }

  text = text || {};

  if (typeof text === 'object') {
    this.opts = parseOptions(text);
  } else {
    // a string (or any other non-object) came first: options are second
    this.opts = parseOptions(options || {});
  }

  return this;
}
-
/**
 * Builds a full option set: a shallow copy of `defaults` in which every key
 * that `options` owns is overridden. Keys unknown to `defaults` are ignored
 * and `defaults` itself is never mutated.
 *
 * @ref: https://scotch.io/bar-talk/copying-objects-in-javascript
 *
 * @param {Object} [options] - user overrides; may be a null-prototype object
 * @returns {Object} the merged options
 */
function parseOptions (options) {
  var parsed = Object.assign({}, defaults);

  // nothing to override
  if (options === null || options === undefined) {
    return parsed;
  }

  // borrowed from Object.prototype so that objects created with
  // `Object.create(null)` (no own `hasOwnProperty`) are accepted too
  var hasOwn = Object.prototype.hasOwnProperty;

  Object.keys(parsed).forEach(function (key) {
    if (hasOwn.call(options, key)) {
      parsed[key] = options[key];
    }
  });

  return parsed;
}
-
/**
 * Replaces every occurrence of the i-th character of `fromBatch` with the
 * i-th character of `toBatch`, one pair after another.
 *
 * Characters are matched literally (regex metacharacters such as `.` or `+`
 * are safe) and positions without a counterpart in `toBatch` are skipped.
 *
 * @param {string} text
 * @param {string} fromBatch - characters to look for
 * @param {string} toBatch - replacements, by position
 * @returns {string}
 */
function charReplace (text, fromBatch, toBatch) {
  var fromChars = fromBatch.split('');
  var toChars = toBatch.split('');

  // only positions present in both batches form a pair
  var pairs = Math.min(fromChars.length, toChars.length);

  for (var i = 0; i < pairs; i++) {
    // split/join is a literal "replace all"
    text = text.split(fromChars[i]).join(toChars[i]);
  }

  return text;
}
-
/**
 * Applies a replacement table to the text.
 *
 * Each own key of `array` is the replacement; its value is a set of
 * characters (placed inside a regex character class) that are replaced by
 * that key. Entries are applied in the object's own key order.
 *
 * @param {string} text
 * @param {Object} array - map of replacement -> characters to replace
 * @returns {string}
 */
function arrReplace (text, array) {
  return Object.keys(array).reduce(function (result, replacement) {
    var anyOf = newRegExp('[' + array[replacement] + ']');
    return result.replace(anyOf, replacement);
  }, text);
}
-
/**
 * Compiles a pattern string into a RegExp; global (`g`) unless flags are given.
 *
 * @param {string} pattern
 * @param {string} [flags] - any falsy value (including '') selects `g`
 * @returns {RegExp}
 */
function newRegExp (pattern, flags) {
  var effectiveFlags = flags ? flags : 'g';
  return new RegExp(pattern, effectiveFlags);
}
-
// Letters that fixSuffixSpacing() accepts as the last character of a word
// (interpolated into a regex character class there).
var charsPersian = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك';

// Diacritic marks; used as a regex character class by fixSuffixSpacing(),
// fixDiacritics() and removeDiacritics().
// @REF: https://en.wikipedia.org/wiki/Persian_alphabet#Diacritics
// `\u064e\u0650\u064f\u064b\u064d\u064c\u0651\u06c0`
// NOTE(review): the escape list above is inherited from the original comment;
// confirm that it still matches the characters in the literal below.
var charsDiacritic = 'ًٌٍَُِّْ';

// Pattern source (compiled through newRegExp() in cleanup()) that recognises
// URIs so they can be set aside before any other clean-up touches them.
// @source: https://github.com/jhermsmeier/uri.regex
var patternURI = "([A-Za-z][A-Za-z0-9+\\-.]*):(?:(//)(?:((?:[A-Za-z0-9\\-._~!$&'()*+,;=:]|%[0-9A-Fa-f]{2})*)@)?((?:\\[(?:(?:(?:(?:[0-9A-Fa-f]{1,4}:){6}|::(?:[0-9A-Fa-f]{1,4}:){5}|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,1}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}|(?:(?:[0-9A-Fa-f]{1,4}:){0,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}|(?:(?:[0-9A-Fa-f]{1,4}:){0,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:|(?:(?:[0-9A-Fa-f]{1,4}:){0,4}[0-9A-Fa-f]{1,4})?::)(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))|(?:(?:[0-9A-Fa-f]{1,4}:){0,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})?::)|[Vv][0-9A-Fa-f]+\\.[A-Za-z0-9\\-._~!$&'()*+,;=:]+)\\]|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:[A-Za-z0-9\\-._~!$&'()*+,;=]|%[0-9A-Fa-f]{2})*))(?::([0-9]*))?((?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|/((?:(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)?)|((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})+(?:/(?:[A-Za-z0-9\\-._~!$&'()*+,;=:@]|%[0-9A-Fa-f]{2})*)*)|)(?:\\?((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?(?:\\#((?:[A-Za-z0-9\\-._~!$&'()*+,;=:@/?]|%[0-9A-Fa-f]{2})*))?";

// Characters that may follow a suffix (whitespace, punctuation, brackets and
// quotes); interpolated into a regex character class by fixSuffixSpacing().
var patternAfter = '\\s.,;،؛!؟?"\'()[\\]{}“”«»';
-
// Default option set. parseOptions() copies it and overrides the keys the
// caller supplies; cleanup() reads each flag to decide whether the matching
// step runs. It is also exposed as `Virastar.prototype.defaults`.
var defaults = {
  // aggresive: true, // DEPRECATED

  // clean-up steps
  cleanup_begin_and_end: true,
  cleanup_extra_marks: true,
  cleanup_kashidas: true,
  cleanup_line_breaks: true,
  cleanup_rlm: true,
  cleanup_spacing: true,
  cleanup_zwnj: true, // gates both cleanupZWNJ() and cleanupZWNJLate()

  decode_htmlentities: true,

  // character / punctuation fixes
  fix_arabic_numbers: true,
  fix_dashes: true,
  fix_diacritics: true, // only consulted when `remove_diacritics` is off
  fix_english_numbers: true,
  fix_english_quotes_pairs: true,
  fix_english_quotes: true,
  fix_hamzeh: true,
  fix_hamzeh_arabic: false, // only consulted inside the hamzeh handling of cleanup()
  fix_misc_non_persian_chars: true,
  fix_misc_spacing: true,
  fix_numeral_symbols: true,
  fix_perfix_spacing: true, // (sic) key spelling is part of the public options
  fix_persian_glyphs: true,
  fix_punctuations: true,
  fix_question_mark: true,
  fix_spacing_for_braces_and_quotes: true, // gates fixBracesSpacing() and fixBracesSpacingInside()
  fix_spacing_for_punctuations: true,
  fix_suffix_misc: true,
  fix_suffix_spacing: true,
  fix_three_dots: true,
  kashidas_as_parenthetic: true,

  // markdown / normalisation
  markdown_normalize_braces: true,
  markdown_normalize_lists: true,
  normalize_dates: true,
  normalize_ellipsis: true,
  normalize_eol: true,

  // spans that are swapped for placeholders before processing and restored afterwards
  preserve_braces: false,
  preserve_brackets: false,
  preserve_comments: true,
  preserve_entities: true,
  preserve_frontmatter: true,
  preserve_HTML: true,
  preserve_nbsps: true,
  preserve_URIs: true,

  remove_diacritics: false,
  skip_markdown_ordered_lists_numbers_conversion: false
};
-
// Persian digits in the order 1..9, 0 — positionally aligned with the source
// batches that fixEnglishNumbers() and fixArabicNumbers() pass to charReplace().
var digits = '۱۲۳۴۵۶۷۸۹۰';
-
/* eslint-disable */
// Named HTML entities that decodeHTMLEntities() turns into characters.
// Keys are entity names without the leading `&`; both the `name;` and the
// bare `name` spelling are looked up, so some entities are listed twice.
var entities = {
  'sbquo;': '\u201a',
  'lsquo;': '\u2018',
  'lsquor;': '\u201a',
  'ldquo;': '\u201c',
  'ldquor;': '\u201e',
  'rdquo;': '\u201d',
  'rdquor;': '\u201d',
  'rsquo;': '\u2019',
  'rsquor;': '\u2019',
  'apos;': '\'',
  'QUOT;': '"',
  'QUOT': '"',
  'quot;': '"',
  'quot': '"',
  'zwj;': '\u200d',
  'ZWNJ;': '\u200c',
  'zwnj;': '\u200c',
  'shy;': '\u00ad' // wrongly used as zwnj
};
-
// Replacement table consumed by fixPersianGlyphs() through arrReplace():
// every character listed in a value is replaced by that entry's key.
// Entries are applied in the order written, so the two ZWNJ-aware entries at
// the top run before the plain `ه` / `ی` entries further down.
// NOTE(review): a few keys (the alef/waw-with-hamza and lam-alef rows) look
// like presentation forms themselves — confirm against upstream.
// props @ebraminio/persiantools
var glyphs = {
  // these two are for visually available ZWNJ #visualZwnj
  '\u200cه': 'ﻫ',
  'ی\u200c': 'ﻰﻲ',
  'ﺃ': 'ﺄﺃ',
  'ﺁ': 'ﺁﺂ',
  'ﺇ': 'ﺇﺈ',
  'ا': 'ﺎا',
  'ب': 'ﺏﺐﺑﺒ',
  'پ': 'ﭖﭗﭘﭙ',
  'ت': 'ﺕﺖﺗﺘ',
  'ث': 'ﺙﺚﺛﺜ',
  'ج': 'ﺝﺞﺟﺠ',
  'چ': 'ﭺﭻﭼﭽ',
  'ح': 'ﺡﺢﺣﺤ',
  'خ': 'ﺥﺦﺧﺨ',
  'د': 'ﺩﺪ',
  'ذ': 'ﺫﺬ',
  'ر': 'ﺭﺮ',
  'ز': 'ﺯﺰ',
  'ژ': 'ﮊﮋ',
  'س': 'ﺱﺲﺳﺴ',
  'ش': 'ﺵﺶﺷﺸ',
  'ص': 'ﺹﺺﺻﺼ',
  'ض': 'ﺽﺾﺿﻀ',
  'ط': 'ﻁﻂﻃﻄ',
  'ظ': 'ﻅﻆﻇﻈ',
  'ع': 'ﻉﻊﻋﻌ',
  'غ': 'ﻍﻎﻏﻐ',
  'ف': 'ﻑﻒﻓﻔ',
  'ق': 'ﻕﻖﻗﻘ',
  'ک': 'ﮎﮏﮐﮑﻙﻚﻛﻜ',
  'گ': 'ﮒﮓﮔﮕ',
  'ل': 'ﻝﻞﻟﻠ',
  'م': 'ﻡﻢﻣﻤ',
  'ن': 'ﻥﻦﻧﻨ',
  'ه': 'ﻩﻪﻫﻬ',
  'هٔ': 'ﮤﮥ',
  'و': 'ﻭﻮ',
  'ﺅ': 'ﺅﺆ',
  'ی': 'ﯼﯽﯾﯿﻯﻰﻱﻲﻳﻴ',
  'ئ': 'ﺉﺊﺋﺌ',
  'لا': 'ﻼ',
  'ﻹ': 'ﻺ',
  'ﻷ': 'ﻸ',
  'ﻵ': 'ﻶ'
};
/* eslint-enable */
-
/**
 * Runs the whole clean-up pipeline over a piece of text.
 *
 * The text is padded with one space on each side, protected spans
 * (frontmatter, HTML tags and comments, brackets, braces, URIs, nbsp entities
 * and other HTML entities) are swapped for placeholders, the enabled fixers
 * run in a fixed order, the protected spans are put back and finally the
 * padding is removed.
 *
 * @param {string} text - the text to clean
 * @param {Object} [options] - per-call options; when omitted the instance
 *   options (`this.opts`) are used, and the defaults if there is no instance
 * @returns {string} the cleaned text; empty or whitespace-only input is
 *   returned untouched
 * @throws {TypeError} when `text` is not a string
 */
function cleanup (text, options) {
  if (typeof text !== 'string') {
    throw new TypeError('Expected a String, but received ' + typeof text);
  }

  // dont bother if its empty or whitespace
  if (!text.trim()) {
    return text;
  }

  // explicit options win, then the instance options; a call without a
  // receiver (plain `cleanup(text)`) falls back to the defaults instead of
  // failing on `this.opts`
  var opts = options
    ? parseOptions(options)
    : ((this && this.opts) || parseOptions({}));

  // swaps every match for a space-padded placeholder and remembers the match
  function preserve (str, pattern, marker, stack) {
    return str.replace(pattern, function (matched) {
      stack.push(matched);
      return ' ' + marker + ' ';
    });
  }

  // puts remembered matches back, in order, eating one optional space on
  // each side of the placeholder
  function restore (str, marker, stack) {
    return str.replace(newRegExp('[ ]?' + marker + '[ ]?'), function () {
      return stack.shift();
    });
  }

  var frontmatter = [];
  var html = [];
  var comments = [];
  var brackets = [];
  var braces = [];
  var mdlinks = [];
  var uris = [];
  var nbsps = [];
  // named so that it does not shadow the module-level `entities` table
  var preservedEntities = [];

  // single space paddings around the string
  text = ' ' + text + ' ';

  // preserves frontmatter data in the text
  if (opts.preserve_frontmatter) {
    text = preserve(text, /^ ---[\S\s]*?---\n/g, '__FRONTMATTER__PRESERVER__', frontmatter);
  }

  // preserves all html tags in the text
  // @props: @wordpress/wordcount
  if (opts.preserve_HTML) {
    text = preserve(text, /<\/?[a-z][^>]*?>/gi, '__HTML__PRESERVER__', html);
  }

  // preserves all html comments in the text
  // @props: @wordpress/wordcount
  if (opts.preserve_comments) {
    text = preserve(text, /<!--[\s\S]*?-->/g, '__COMMENT__PRESERVER__', comments);
  }

  // preserves strings inside square brackets (`[]`)
  if (opts.preserve_brackets) {
    text = preserve(text, /(\[.*?\])/g, '__BRACKETS__PRESERVER__', brackets);
  }

  // preserves strings inside curly braces (`{}`)
  if (opts.preserve_braces) {
    text = preserve(text, /(\{.*?\})/g, '__BRACES__PRESERVER__', braces);
  }

  // preserves all uri strings in the text
  if (opts.preserve_URIs) {
    // stores markdown links separately
    text = text.replace(/]\((.*?)\)/g, function (matched, link) {
      if (link) {
        mdlinks.push(link.trim());
        return '](__MD_LINK__PRESERVER__)'; // no padding!
      }
      return matched;
    });

    text = preserve(text, newRegExp(patternURI), '__URI__PRESERVER__', uris);
  }

  // preserves all no-break space entities in the text
  // (must run before the entities are decoded below)
  if (opts.preserve_nbsps) {
    text = preserve(text, /&nbsp;|&#160;/gi, '__NBSPS__PRESERVER__', nbsps);
  }

  if (opts.decode_htmlentities) {
    text = decodeHTMLEntities(text);
  }

  // preserves all html entities that are still in the text
  // @props: @substack/node-ent
  if (opts.preserve_entities) {
    text = preserve(text, /&(#?[^;\W]+;?)/g, '__ENTITIES__PRESERVER__', preservedEntities);
  }

  if (opts.normalize_eol) {
    text = normalizeEOL(text);
  }

  if (opts.fix_persian_glyphs) {
    text = fixPersianGlyphs(text);
  }

  if (opts.fix_dashes) {
    text = fixDashes(text);
  }

  if (opts.fix_three_dots) {
    text = fixThreeDots(text);
  }

  if (opts.normalize_ellipsis) {
    text = normalizeEllipsis(text);
  }

  if (opts.fix_english_quotes_pairs) {
    text = fixEnglishQuotesPairs(text);
  }

  if (opts.fix_english_quotes) {
    text = fixEnglishQuotes(text);
  }

  if (opts.fix_hamzeh) {
    if (opts.fix_hamzeh_arabic) {
      text = fixHamzehArabic(text);
    }

    text = fixHamzeh(text);
  } else if (opts.fix_suffix_spacing) {
    if (opts.fix_hamzeh_arabic) {
      text = fixHamzehArabicAlt(text);
    }

    text = fixSuffixSpacingHamzeh(text);
  }

  if (opts.cleanup_rlm) {
    text = cleanupRLM(text);
  }

  if (opts.cleanup_zwnj) {
    text = cleanupZWNJ(text);
  }

  if (opts.fix_arabic_numbers) {
    text = fixArabicNumbers(text);
  }

  // word tokenizer: the fixers below run per whitespace-delimited word so
  // that some words can be skipped entirely
  text = text.replace(/(^|\s+)([[({"'“«]?)(\S+)([\])}"'”»]?)(?=($|\s+))/g,
    function (matched, before, leadings, word, trailings, after) {
      // should not replace to persian chars in english phrases
      if (word.match(/[a-zA-Z\-_]{2,}/g)) {
        return matched;
      }

      // should not touch sprintf directives
      // @source: https://stackoverflow.com/a/8915445/
      if (word.match(/%(?:\d+\$)?[+-]?(?:[ 0]|'.{1})?-?\d*(?:\.\d+)?[bcdeEufFgGosxX]/g)) {
        return matched;
      }

      // should not touch numbers in html entities
      if (word.match(/&#\d+;/g)) {
        return matched;
      }

      // skips converting english numbers of ordered lists in markdown
      if (opts.skip_markdown_ordered_lists_numbers_conversion && (matched + trailings + after).match(/(?:(?:\r?\n)|(?:\r\n?)|(?:^|\n))\d+\.\s/)) {
        return matched;
      }

      if (opts.fix_english_numbers) {
        matched = fixEnglishNumbers(matched);
      }

      if (opts.fix_numeral_symbols) {
        matched = fixNumeralSymbols(matched);
      }

      if (opts.fix_punctuations) {
        matched = fixPunctuations(matched);
      }

      if (opts.fix_misc_non_persian_chars) {
        matched = fixMiscNonPersianChars(matched);
      }

      if (opts.fix_question_mark) {
        matched = fixQuestionMark(matched);
      }

      return matched;
    }
  );

  if (opts.normalize_dates) {
    text = normalizeDates(text);
  }

  if (opts.fix_perfix_spacing) {
    text = fixPerfixSpacing(text);
  }

  if (opts.fix_suffix_spacing) {
    text = fixSuffixSpacing(text);
  }

  if (opts.fix_suffix_misc) {
    text = fixSuffixMisc(text);
  }

  if (opts.fix_spacing_for_braces_and_quotes) {
    text = fixBracesSpacing(text);
  }

  if (opts.cleanup_extra_marks) {
    text = cleanupExtraMarks(text);
  }

  if (opts.fix_spacing_for_punctuations) {
    text = fixPunctuationSpacing(text);
  }

  if (opts.kashidas_as_parenthetic) {
    text = kashidasAsParenthetic(text);
  }

  if (opts.cleanup_kashidas) {
    text = cleanupKashidas(text);
  }

  if (opts.markdown_normalize_braces) {
    text = markdownNormalizeBraces(text);
  }

  if (opts.markdown_normalize_lists) {
    text = markdownNormalizeLists(text);
  }

  // doing it again after `fixPunctuationSpacing()`
  if (opts.fix_spacing_for_braces_and_quotes) {
    text = fixBracesSpacingInside(text);
  }

  if (opts.fix_misc_spacing) {
    text = fixMiscSpacing(text);
  }

  if (opts.remove_diacritics) {
    text = removeDiacritics(text);
  } else if (opts.fix_diacritics) {
    text = fixDiacritics(text);
  }

  if (opts.cleanup_spacing) {
    text = cleanupSpacing(text);
  }

  if (opts.cleanup_zwnj) {
    text = cleanupZWNJLate(text);
  }

  if (opts.cleanup_line_breaks) {
    text = cleanupLineBreaks(text);
  }

  // bringing the preserved spans back, in the reverse order of preservation

  if (opts.preserve_entities) {
    text = restore(text, '__ENTITIES__PRESERVER__', preservedEntities);
  }

  if (opts.preserve_nbsps) {
    text = restore(text, '__NBSPS__PRESERVER__', nbsps);
  }

  if (opts.preserve_URIs) {
    // no padding!
    text = text.replace(/__MD_LINK__PRESERVER__/g, function () {
      return mdlinks.shift();
    });

    text = restore(text, '__URI__PRESERVER__', uris);
  }

  if (opts.preserve_braces) {
    text = restore(text, '__BRACES__PRESERVER__', braces);
  }

  if (opts.preserve_brackets) {
    text = restore(text, '__BRACKETS__PRESERVER__', brackets);
  }

  if (opts.preserve_comments) {
    text = restore(text, '__COMMENT__PRESERVER__', comments);
  }

  if (opts.preserve_HTML) {
    text = restore(text, '__HTML__PRESERVER__', html);
  }

  if (opts.preserve_frontmatter) {
    text = restore(text, '__FRONTMATTER__PRESERVER__', frontmatter);
  }

  if (opts.cleanup_begin_and_end) {
    text = cleanupBeginAndEnd(text);
  } else {
    // removes single space paddings around the string
    text = text.replace(/^[ ]/g, '').replace(/[ ]$/g, '');
  }

  return text;
}
-
/**
 * First clean-up pass for zero-width non-joiners (ZWNJ, U+200C).
 * props @ebraminio/persiantools
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupZWNJ (text) {
  var result = text;

  // soft hyphens (U+00AD) become zwnj
  result = result.replace(/\u00ad/g, '\u200c');

  // a run of zwnj collapses into a single one
  result = result.replace(/\u200c{2,}/g, '\u200c');

  // zwnj is dropped right before and right after numbers, english words,
  // spaces and punctuations
  result = result.replace(/\u200c([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])/g, '$1');
  result = result.replace(/([\w\s0-9۰-۹[\](){}«»“”.…,:;?!$%@#*=+\-/\\،؛٫٬×٪؟ـ])\u200c/g, '$1');

  // zwnj at the start/end of a line is dropped
  result = result.replace(/(^\u200c|\u200c$)/gm, '');

  return result;
}
-
/**
 * Late zwnj check: drops a zwnj that follows a character which does not
 * connect to the next one anyway (non-joining letters and some marks).
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupZWNJLate (text) {
  var afterNonJoiner = /([إأةؤورزژاآدذ،؛,:«»\\/@#$٪×*()ـ\-=|])\u200c/g;
  return text.replace(afterNonJoiner, '$1');
}
-
/**
 * Decodes numeric character references (`&#1705;`, `&#x6a9;`) and the named
 * entities listed in the `entities` table into their characters. Anything
 * that cannot be decoded is left exactly as written.
 *
 * @props: @substack/node-ent
 *
 * @param {string} text
 * @returns {string}
 */
function decodeHTMLEntities (text) {
  return text.replace(/&(#?[^;\W]+;?)/g, function (matched, match) {
    // decimal or hexadecimal reference; anchored at both ends so that a
    // malformed reference is never partially decoded
    var numeric = /^#(?:(\d+)|[Xx]([A-Fa-f0-9]+));?$/.exec(match);
    if (numeric) {
      var codePoint = numeric[1] !== undefined
        ? parseInt(numeric[1], 10)
        : parseInt(numeric[2], 16);

      // fromCodePoint (unlike fromCharCode) handles characters beyond the
      // BMP; values past the Unicode range stay untouched
      return codePoint <= 0x10FFFF ? String.fromCodePoint(codePoint) : matched;
    }

    // named entity: try the bare name first, then the name with its semicolon
    var hasSemi = /;$/.test(match);
    var withoutSemi = hasSemi ? match.replace(/;$/, '') : match;
    var target = entities[withoutSemi] || (hasSemi && entities[match]);

    if (typeof target === 'number') {
      return String.fromCodePoint(target);
    }

    if (typeof target === 'string') {
      return target;
    }

    return matched;
  });
}
-
/**
 * Turns every end-of-line sequence (`\r\n`, `\r` or `\n`) into unix `\n`.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeEOL (text) {
  var anyEOL = /(\r?\n)|(\r\n?)/g;
  return text.replace(anyEOL, '\n');
}
-
/**
 * Replaces hyphen runs with typographic dashes: three hyphens become an
 * mdash first, then every remaining pair becomes an ndash.
 *
 * @param {string} text
 * @returns {string}
 */
function fixDashes (text) {
  var withMdash = text.replace(/-{3}/g, '—');
  var withNdash = withMdash.replace(/-{2}/g, '–');
  return withNdash;
}
-
/**
 * Turns three or more dots (optionally separated by spaces) into a single
 * ellipsis character, swallowing spaces/tabs right before them.
 *
 * @param {string} text
 * @returns {string}
 */
function fixThreeDots (text) {
  // joins dots that are only separated by spaces
  var joined = text.replace(/\.([ ]+)(?=[.])/g, '.');

  // three dots (or more) become the ellipsis character
  return joined.replace(/[ \t]*\.{3,}/g, '…');
}
-
/**
 * Normalizes ellipsis characters: a run of them becomes one, and whatever
 * mix of spaces, tabs and zwnj follows it becomes exactly one space.
 * Spaces before the ellipsis are kept as they are.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeEllipsis (text) {
  return text

    // replaces more than one ellipsis with one
    .replace(/(…){2,}/g, '…')

    // replaces (space|tab|zwnj) after ellipsis with one space
    // NOTE: allows for space before ellipsis. The leading spaces are captured
    // with a single flat quantifier: a nested one (`([ ]{1,})*`) backtracks
    // exponentially on long runs of spaces that no ellipsis follows.
    .replace(/([ ]*)…[ \t\u200c]*/g, '$1… ')
  ;
}
-
/**
 * Replaces english curly quote pairs (“…”) with persian guillemets («…»).
 *
 * @param {string} text
 * @returns {string}
 */
function fixEnglishQuotesPairs (text) {
  var curlyPair = /(“)(.+?)(”)/g;
  return text.replace(curlyPair, '«$2»');
}
-
/**
 * Replaces text wrapped in matching straight quote marks (", ' or `, also
 * when repeated) with persian guillemets («…»).
 *
 * @param {string} text
 * @returns {string}
 */
function fixEnglishQuotes (text) {
  var quoted = /(["'`]+)(.+?)(\1)/g;
  return text.replace(quoted, '«$2»');
}
-
/**
 * Normalizes the different spellings of "heh + hamzeh" at the end of a word
 * to the standard `هٔ`.
 *
 * @param {string} text
 * @returns {string}
 */
function fixHamzeh (text) {
  var standard = '$1هٔ$3';
  var result = text;

  // heh + (space|ZWNJ|lrm) + ye
  result = result.replace(/(\S)(ه[\s\u200c\u200e]+[یي])([\s\u200c\u200e])/g, standard);

  // heh + optional (space|ZWNJ|lrm) + standalone hamza
  result = result.replace(/(\S)(ه[\s\u200c\u200e]?\u0621)([\s\u200c\u200e])/g, standard);

  // `هٓ` and the single-character `ۀ` become the standard form
  // props @ebraminio/persiantools
  result = result.replace(/(ۀ|هٓ)/g, 'هٔ');

  return result;
}
-
/**
 * Converts a word-final arabic `ة` (followed by whitespace, ZWNJ or LRM)
 * to `هٔ`.
 *
 * @param {string} text
 * @returns {string}
 */
function fixHamzehArabic (text) {
  var finalTehMarbuta = /(\S)ة([\s\u200c\u200e])/g;
  return text.replace(finalTehMarbuta, '$1هٔ$2');
}
-
/**
 * Converts a word-final arabic `ة` (followed by whitespace, ZWNJ or LRM)
 * to `هی`.
 *
 * @param {string} text
 * @returns {string}
 */
function fixHamzehArabicAlt (text) {
  var finalTehMarbuta = /(\S)ة([\s\u200c\u200e])/g;
  return text.replace(finalTehMarbuta, '$1هی$2');
}
-
/**
 * Converts a right-to-left mark (U+200F) into a zwnj (U+200C) unless it
 * directly follows a latin letter, hyphen or underscore.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupRLM (text) {
  var rlmAfterNonLatin = /([^a-zA-Z\-_])(\u200F)/g;
  return text.replace(rlmAfterNonLatin, '$1\u200c');
}
-
/**
 * Converts presentation-form glyphs to standard characters, driven by the
 * `glyphs` replacement table.
 *
 * @param {string} text
 * @returns {string}
 */
function fixPersianGlyphs (text) {
  var standardized = arrReplace(text, glyphs);
  return standardized;
}
-
/**
 * Replaces look-alike letters from other alphabets with their persian
 * counterparts. Both batches are aligned position by position:
 *
 *   ك (arabic kaf)        -> ک
 *   ڪ (swash kaf)         -> ک
 *   ي (arabic yeh)        -> ی
 *   ى (alef maksura)      -> ی
 *   ۍ (pashto yeh)        -> ی
 *   ې (uyghur/pashto e)   -> ی
 *   ہ (heh goal)          -> ه
 *   ە (kurdish ae)        -> ه
 *   ھ (heh doachashmee)   -> ه
 *
 * props @ebraminio/persiantools
 *
 * @param {string} text
 * @returns {string}
 */
function fixMiscNonPersianChars (text) {
  // `ھ` closes the source batch so that it has a partner for the ninth
  // target character
  return charReplace(text, 'كڪيىۍېہەھ', 'ککییییههه');
}
-
/**
 * Replaces english (ASCII) digits with their persian equivalents.
 *
 * @param {string} text
 * @returns {string}
 */
function fixEnglishNumbers (text) {
  var englishDigits = '1234567890';
  return charReplace(text, englishDigits, digits);
}
-
/**
 * Replaces arabic-indic digits with their persian equivalents.
 *
 * @param {string} text
 * @returns {string}
 */
function fixArabicNumbers (text) {
  var arabicDigits = '١٢٣٤٥٦٧٨٩٠';
  return charReplace(text, arabicDigits, digits);
}
-
/**
 * Converts persian and arabic-indic digits to ASCII digits.
 * The low four bits of each digit's code unit are its numeric value.
 *
 * @REF: https://github.com/shkarimpour/pholiday/pull/5/files
 *
 * @param {string} text
 * @returns {string}
 */
function convertPersianNumbers (text) {
  var localDigit = /[\u0660-\u0669\u06f0-\u06f9]/g;
  return text.replace(localDigit, function (digit) {
    var value = digit.charCodeAt(0) & 0xf;
    return String(value);
  });
}
-
/**
 * Localizes the symbols used next to persian digits: percent sign, decimal
 * separator and thousands separator.
 *
 * @param {string} text
 * @returns {string}
 */
function fixNumeralSymbols (text) {
  var rules = [
    // percent sign after a digit (U+066A); props @ebraminio/persiantools
    [/([۰-۹]) ?%/g, '$1٪'],
    // dot between digits -> decimal separator (U+066B); props @ebraminio/persiantools
    [/([۰-۹])\.(?=[۰-۹])/g, '$1٫'],
    // comma between digits -> thousands separator (U+066C); props @languagetool-org
    [/([۰-۹]),(?=[۰-۹])/g, '$1٬']
  ];

  var result = text;
  for (var i = 0; i < rules.length; i++) {
    result = result.replace(rules[i][0], rules[i][1]);
  }
  return result;
}
-
/**
 * Re-orders `day/month/year` dates (both delimiters `/` or both `-`, year
 * with four digits) into `year/month/day`, always joined by slashes.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeDates (text) {
  var dayMonthYear = /([0-9۰-۹]{1,2})([/-])([0-9۰-۹]{1,2})\2([0-9۰-۹]{4})/g;

  return text.replace(dayMonthYear, function (whole, day, separator, month, year) {
    return [year, month, day].join('/');
  });
}
-
/**
 * Replaces the english comma and semicolon with their persian equivalents.
 *
 * @param {string} text
 * @returns {string}
 */
function fixPunctuations (text) {
  var english = ',;';
  var persian = '،؛';
  return charReplace(text, english, persian);
}
-
/**
 * Replaces the english question mark with the persian one (U+061F).
 *
 * @param {string} text
 * @returns {string}
 */
function fixQuestionMark (text) {
  var persianQuestionMark = '\u061F'; // `؟`
  return text.replace(/(\?)/g, persianQuestionMark);
}
-
/**
 * Joins the prefixes mi*, nemi* and bi* to the following word with a zwnj
 * instead of a space.
 * NOTE: there's a possible bug here: prefixes could be separate nouns
 *
 * @param {string} text
 * @returns {string}
 */
function fixPerfixSpacing (text) {
  var joined = '$1\u200c$3';
  var prefixes = [
    /((\s|^)ن?می) ([^ ])/g,
    /((\s|^)بی) ([^ ])/g // props @zoghal
  ];

  return prefixes.reduce(function (result, prefix) {
    return result.replace(prefix, joined);
  }, text);
}
-
/**
 * Joins common suffixes to the preceding word with a zwnj instead of a space.
 * A suffix only counts when it follows a persian letter or diacritic and is
 * itself followed by one of the `patternAfter` characters.
 * NOTE: possible bug: suffixes could be nouns
 *
 * @param {string} text
 * @returns {string}
 */
function fixSuffixSpacing (text) {
  // shared head and tail of all the patterns below
  var wordEnd = '([' + charsPersian + charsDiacritic + ']) ';
  var boundary = '[' + patternAfter + '])';

  var suffixes = [
    // must done before others
    // *ha *haye
    '(ها(ی)?',
    // *am *at *ash *ei *eid *eem *and *man *tan *shan
    '((ام|ات|اش|ای|اید|ایم|اند|مان|تان|شان)',
    // *tar *tari *tarin
    '(تر((ی)|(ین))?',
    // *hayee *hayam *hayat *hayash *hayetan *hayeman *hayeshan
    '((هایی|هایم|هایت|هایش|هایمان|هایتان|هایشان)'
  ];

  var result = text;
  for (var i = 0; i < suffixes.length; i++) {
    result = result.replace(newRegExp(wordEnd + suffixes[i] + boundary), '$1\u200c$2');
  }
  return result;
}
-
/**
 * Rewrites word-final "heh + ye / hamza" spellings as heh + zwnj + ye
 * (`ه\u200cی`).
 *
 * @param {string} text
 * @returns {string}
 */
function fixSuffixSpacingHamzeh (text) {
  var hehZwnjYe = '$1\u0647\u200c\u06cc$3';
  var result = text;

  // heh + ye
  result = result.replace(/(\S)(ه[\s\u200c]+[یي])([\s\u200c])/g, hehZwnjYe);

  // heh + standalone hamza
  result = result.replace(/(\S)(ه[\s\u200c]?\u0621)([\s\u200c])/g, hehZwnjYe);

  // heh + hamza above
  result = result.replace(/(\S)(ه[\s\u200c]?\u0654)([\s\u200c])/g, hehZwnjYe);

  return result;
}
-
/**
 * Replaces heh + (ZWNJ|LRM) + (ئ or ی) + ی at the end of a word with
 * `ه\u200cای`.
 * EXAMPLE: `خانه\u200cئی` becomes `خانه\u200cای`
 * props @ebraminio/persiantools
 *
 * @param {string} text
 * @returns {string}
 */
function fixSuffixMisc (text) {
  var hehYeYe = /(\S)ه[\u200c\u200e][ئی]ی([\s\u200c\u200e])/g;
  return text.replace(hehYeYe, '$1ه\u200cای$2');
}
-
/**
 * Tidies up repeated exclamation and question marks.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupExtraMarks (text) {
  var result = text;

  // removes space between different/same marks (combining for cleanup)
  result = result.replace(/([؟?!])([ ]+)(?=[؟?!])/g, '$1');

  // more than one exclamation mark becomes just one
  result = result.replace(/(!){2,}/g, '$1');

  // more than one english or persian question mark becomes just one
  result = result.replace(/(\u061F|\?){2,}/g, '$1'); // \u061F = `؟`

  // re-orders consecutive marks so that the question mark comes first
  result = result.replace(/(!)([ \t]*)([\u061F?])/g, '$3$1'); // `!?` --> `?!`

  return result;
}
-
/**
 * Replaces kashidas (U+0640) that touch whitespace with an ndash, i.e.
 * kashidas used as parenthetic dashes.
 *
 * @param {string} text
 * @returns {string}
 */
function kashidasAsParenthetic (text) {
  var afterSpace = text.replace(/(\s)\u0640+/g, '$1–');
  var beforeSpace = afterSpace.replace(/\u0640+(\s)/g, '–$1');
  return beforeSpace;
}
-
/**
 * Removes kashidas inside words; between two numbers they become an ndash.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupKashidas (text) {
  // kashida(s) between numbers turn into an ndash
  var ranged = text.replace(/([0-9۰-۹]+)ـ+([0-9۰-۹]+)/g, '$1–$2');

  // every other kashida run that sits between non-whitespace characters goes
  // MAYBE: more punctuations
  return ranged.replace(/([^\s.])\u0640+(?![\s.])/g, '$1');
}
-
/**
 * Fixes the spacing around punctuation marks: nothing before a mark, one
 * space after it, with exceptions for times, decimals and domain names.
 *
 * @param {string} text
 * @returns {string}
 */
function fixPunctuationSpacing (text) {
  var rules = [
    // removes space before punctuations
    [/[ \t\u200c]*([:;,؛،.؟?!]{1})/g, '$1'],

    // exactly one space after punctuations,
    // except when followed by new-lines (or preservers)
    [/([:;,؛،.؟?!]{1})[ \t\u200c]*(?!\n|_{2})/g, '$1 '],

    // removes space after colon that separates time parts
    [/([0-9۰-۹]+):\s+([0-9۰-۹]+)/g, '$1:$2'],

    // removes space after dots in numbers
    [/([0-9۰-۹]+)\. ([0-9۰-۹]+)/g, '$1.$2'],

    // removes space before common domain tlds
    [/([\w\-_]+)\. (ir|com|org|net|info|edu|me)([\s/\\\])»:;.])/g, '$1.$2$3'],

    // removes space between different/same marks (double-check)
    [/([؟?!])([ ]+)(?=[؟?!])/g, '$1']
  ];

  return rules.reduce(function (result, rule) {
    return result.replace(rule[0], rule[1]);
  }, text);
}
-
/**
 * For `()`, `[]`, `{}`, `“”` and `«»`: removes the spaces just inside the
 * pair and leaves exactly one space on each outer side.
 *
 * @param {string} text
 * @returns {string}
 */
function fixBracesSpacing (text) {
  var padded = ' $1$2$3 ';
  var pairs = [
    /[ \t\u200c]*(\()\s*([^)]+?)\s*?(\))[ \t\u200c]*/g,
    /[ \t\u200c]*(\[)\s*([^\]]+?)\s*?(\])[ \t\u200c]*/g,
    /[ \t\u200c]*(\{)\s*([^}]+?)\s*?(\})[ \t\u200c]*/g,
    /[ \t\u200c]*(“)\s*([^”]+?)\s*?(”)[ \t\u200c]*/g,
    /[ \t\u200c]*(«)\s*([^»]+?)\s*?(»)[ \t\u200c]*/g
  ];

  return pairs.reduce(function (result, pair) {
    return result.replace(pair, padded);
  }, text);
}
-
/**
 * Removes the spaces just inside `()`, `[]`, `{}`, `“”` and `«»` without
 * touching the outside, then closes the gap that a markdown link leaves
 * before an enclosing `)`.
 *
 * @param {string} text
 * @returns {string}
 */
function fixBracesSpacingInside (text) {
  var tight = '$1$2$3';
  var pairs = [
    /(\()\s*([^)]+?)\s*?(\))/g,
    /(\[)\s*([^\]]+?)\s*?(\])/g,
    /(\{)\s*([^}]+?)\s*?(\})/g,
    /(“)\s*([^”]+?)\s*?(”)/g,
    /(«)\s*([^»]+?)\s*?(»)/g
  ];

  var result = text;
  for (var i = 0; i < pairs.length; i++) {
    result = result.replace(pairs[i], tight);
  }

  // NOTE: must be here, does not work when placed in `markdownNormalizeBraces()`
  // removes markdown link spaces inside normal ()
  return result.replace(/(\(\[.*?\]\(.*?\))\s+(\))/g, '$1$2');
}
-
/**
 * Repairs the spacing of markdown constructs built from brackets: images,
 * links and doubled/tripled delimiters.
 *
 * @param {string} text
 * @returns {string}
 */
function markdownNormalizeBraces (text) {
  var rules = [
    // removes space between ! and opening brace on markdown images
    // EXAMPLE: `! [alt] (src)` --> `![alt](src)`
    [/! (\[.*?\])[ ]?(\(.*?\))[ ]?/g, '!$1$2'],

    // removes spaces between [] and ()
    // EXAMPLE: `[text] (link)` --> `[text](link)`
    [/(\[.*?\])[ \t]+(\(.*?\))/g, '$1$2'],

    // removes spaces inside double () [] {}
    // EXAMPLE: `[[ text ]]` --> `[[text]]`
    [/\(\([ \t]*(.*?)[ \t]*\)\)/g, '(($1))'],
    [/\[\[[ \t]*(.*?)[ \t]*\]\]/g, '[[$1]]'],
    [/\{\{[ \t]*(.*?)[ \t]*\}\}/g, '{{$1}}'],
    [/\{\{\{[ \t]*(.*?)[ \t]*\}\}\}/g, '{{{$1}}}'], // mustache escape

    // removes spaces between double () [] {}
    // EXAMPLE: `[[text] ]` --> `[[text]]`
    [/(\(\(.*\))[ \t]+(\))/g, '$1$2'],
    [/(\[\[.*\])[ \t]+(\])/g, '$1$2'],
    [/(\{\{.*\})[ \t]+(\})/g, '$1$2']
  ];

  var result = text;
  rules.forEach(function (rule) {
    result = result.replace(rule[0], rule[1]);
  });
  return result;
}
-
/**
 * Removes the blank lines between two consecutive lines that start with the
 * same marker (`*`, `-` or `#`).
 *
 * @param {string} text
 * @returns {string}
 */
function markdownNormalizeLists (text) {
  var gaps = [
    /((\n|^)\*.*?)\n+(?=\n\*)/g,
    /((\n|^)-.*?)\n+(?=\n-)/g,
    /((\n|^)#.*?)\n+(?=\n#)/g
  ];

  return gaps.reduce(function (result, gap) {
    return result.replace(gap, '$1');
  }, text);
}
-
/**
 * Removes the space before a few parenthesized abbreviations and before
 * bracketed numbers.
 *
 * @param {string} text
 * @returns {string}
 */
function fixMiscSpacing (text) {
  // removes space before parentheses on misc cases
  var result = text.replace(/ \((ص|عج|س|ع|ره)\)/g, '($1)');

  // removes space before braces containing numbers
  result = result.replace(/ \[([0-9۰-۹]+)\]/g, '[$1]');

  return result;
}
-
/**
 * Tidies diacritic marks: drops a zwnj or spaces that separate a mark from
 * its letter and collapses every run of marks to the last mark of the run.
 *
 * @param {string} text
 * @returns {string}
 */
function fixDiacritics (text) {
  var mark = '[' + charsDiacritic + ']';

  return text
    // cleans zwnj before diacritic characters
    .replace(newRegExp('\u200c(' + mark + ')'), '$1')

    // cleans more than one diacritic characters, keeping the last one.
    // Matches the run itself: wrapping it in greedy `(.*)` groups let the
    // global replace reach only the last run of each line, and only the
    // last two marks of that run.
    // props @languagetool-org
    .replace(newRegExp(mark + '+(' + mark + ')'), '$1')

    // cleans spaces before diacritic characters
    .replace(newRegExp('(\\S)[ ]+(' + mark + ')'), '$1$2')
  ;
}
-
/**
 * Removes all diacritic characters from the text.
 *
 * @param {string} text
 * @returns {string}
 */
function removeDiacritics (text) {
  var anyDiacritics = newRegExp('[' + charsDiacritic + ']+');
  return text.replace(anyDiacritics, '');
}
-
/**
 * Collapses runs of spaces and empties lines that hold only invisible
 * spacing characters.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupSpacing (text) {
  // more than one space becomes a single one,
  // except before/after preservers and before new-lines
  // (a lookbehind `(?<![_]{2})` would do the same as the leading `[^_]`)
  var collapsed = text.replace(/([^_])([ ]{2,})(?![_]{2}|\n)/g, '$1 ');

  // cleans tab/space/zwnj/zwj/nbsp between two new-lines(\n)
  // @REF: https://stackoverflow.com/a/10965543/
  return collapsed.replace(/^\n([\t\u0020\u200c\u200d\u00a0]*)\n$/gm, '\n\n');
}
-
/**
 * Caps contiguous line-breaks at two (i.e. at most one empty line).
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupLineBreaks (text) {
  var breaks = /\n{2,}/g;
  return text.replace(breaks, '\n\n');
}
-
/**
 * Trims the text as a whole and removes the spacing at the start of every
 * line.
 *
 * @param {string} text
 * @returns {string}
 */
function cleanupBeginAndEnd (text) {
  // removes space/tab/zwnj/nbsp from the beginning of the new-lines
  var unindented = text.replace(/([\n]+)[ \t\u200c\u00a0]*/g, '$1');

  // removes spaces, tabs, zwnj, direction marks and new lines from
  // the beginning and end of text
  // @REF: http://stackoverflow.com/a/38490203
  return unindented.replace(/^[\s\u200c\u200e\u200f]+|[\s\u200c\u200e\u200f]+$/g, '');
}
-
/**
 * Moves punctuation that ended up on the wrong side of the text: a leading
 * `!`, `.`, `،`, `…` or `"` goes to the end, a trailing `-` goes to the
 * start (followed by a space). Each mark is checked once, in that order.
 *
 * @param {string} text
 * @returns {string}
 */
function flipPunctuations (text) {
  var leadingMarks = ['!', '.', '،', '…', '"'];
  var trailingMarks = ['-'];
  var toAppend = [];
  var toPrepend = [];
  var flipped = fixThreeDots(text);

  // marks found at the start are cut off and queued for the end
  leadingMarks.forEach(function (mark) {
    var atStart = newRegExp('^\\' + mark, 'i');
    if (atStart.test(flipped)) {
      flipped = flipped.replace(atStart, '').trim();
      toAppend.push(mark);
    }
  });

  // marks found at the end are cut off and queued for the start
  trailingMarks.forEach(function (mark) {
    var atEnd = newRegExp('\\' + mark + '$', 'i');
    if (atEnd.test(flipped)) {
      flipped = flipped.replace(atEnd, '').trim();
      toPrepend.push(mark);
    }
  });

  toPrepend.forEach(function (mark) {
    flipped = mark + ' ' + flipped;
  });

  toAppend.forEach(function (mark) {
    flipped += mark;
  });

  return normalizeEllipsis(flipped);
}
-
/**
 * Swaps reversed quote pairs: `»…«` becomes `«…»` and `”…“` becomes `“…”`.
 *
 * @param {string} text
 * @returns {string}
 */
function swapQuotes (text) {
  var guillemetsFixed = text.replace(/(»)(.+?)(«)/g, '«$2»');
  var curlyFixed = guillemetsFixed.replace(/(”)(.+?)(“)/g, '“$2”');
  return curlyFixed;
}
-
// Public surface of a Virastar instance. Only the entries below are exposed;
// the individual fixers stay private to this module (the commented-out list
// shows the names they would be exposed under).
Virastar.prototype = {

  // public methods
  defaults: defaults,
  cleanup: cleanup,

  // internal methods
  // cleanupZWNJ: cleanupZWNJ,
  // cleanupZWNJLate: cleanupZWNJLate,
  // decodeHTMLEntities: decodeHTMLEntities,
  // normalizeEOL: normalizeEOL,
  // fixDashes: fixDashes,
  // fixThreeDots: fixThreeDots,
  // normalizeEllipsis: normalizeEllipsis,
  // fixEnglishQuotesPairs: fixEnglishQuotesPairs,
  // fixEnglishQuotes: fixEnglishQuotes,
  // fixHamzeh: fixHamzeh,
  // fixHamzehArabic: fixHamzehArabic,
  // fixHamzehArabicAlt: fixHamzehArabicAlt,
  // cleanupRLM: cleanupRLM,
  // fixPersianGlyphs: fixPersianGlyphs,
  // fixMiscNonPersianChars: fixMiscNonPersianChars,
  // fixEnglishNumbers: fixEnglishNumbers,
  // fixArabicNumbers: fixArabicNumbers,
  // fixNumeralSymbols: fixNumeralSymbols,
  // fixPunctuations: fixPunctuations,
  // fixQuestionMark: fixQuestionMark,
  // fixPerfixSpacing: fixPerfixSpacing,
  // fixSuffixSpacing: fixSuffixSpacing,
  // fixSuffixSpacingHamzeh: fixSuffixSpacingHamzeh,
  // fixSuffixMisc: fixSuffixMisc,
  // cleanupExtraMarks: cleanupExtraMarks,
  // kashidasAsParenthetic: kashidasAsParenthetic,
  // cleanupKashidas: cleanupKashidas,
  // fixPunctuationSpacing: fixPunctuationSpacing,
  // fixBracesSpacing: fixBracesSpacing,
  // fixBracesSpacingInside: fixBracesSpacingInside,
  // markdownNormalizeBraces: markdownNormalizeBraces,
  // markdownNormalizeLists: markdownNormalizeLists,
  // fixDiacritics: fixDiacritics,
  // cleanupSpacing: cleanupSpacing,
  // cleanupLineBreaks: cleanupLineBreaks,
  // cleanupBeginAndEnd: cleanupBeginAndEnd,

  // extra methods (not part of the cleanup() pipeline)
  convertPersianNumbers: convertPersianNumbers,
  flipPunctuations: flipPunctuations,
  swapQuotes: swapQuotes
};
-
- return Virastar;
- }));