htmlparser

HTML Parser By John Resig

Este script no debería instalarse directamente. Es una biblioteca que utilizan otros scripts mediante la meta-directiva de inclusión // @require https://update.greatest.deepsurf.us/scripts/4535/15389/htmlparser.js

  1. /*
  2. * HTML Parser By John Resig (ejohn.org)
  3. * Original code by Erik Arvidsson, Mozilla Public License
  4. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  5. *
  6. * // Use like so:
  7. * HTMLParser(htmlString, {
  8. * start: function(tag, attrs, unary) {},
  9. * end: function(tag) {},
  10. * chars: function(text) {},
  11. * comment: function(text) {}
  12. * });
  13. *
  14. * // or to get an XML string:
  15. * HTMLtoXML(htmlString);
  16. *
  17. * // or to get an XML DOM Document
  18. * HTMLtoDOM(htmlString);
  19. *
  20. * // or to inject into an existing document/DOM node
  21. * HTMLtoDOM(htmlString, document);
  22. * HTMLtoDOM(htmlString, document.body);
  23. *
  24. */
  25.  
  26. (function(){
  27.  
  // Regular expressions for parsing tags and attributes.
  //   startTag captures: 1 = tag name, 2 = raw attribute text, 3 = "/" when the tag is self-closed.
  //   endTag   captures: 1 = tag name.
  //   attr     captures: 1 = attribute name, 2 = double-quoted value, 3 = single-quoted value,
  //            4 = unquoted value. It carries the "g" flag because it is applied repeatedly
  //            through String.prototype.replace to walk every attribute in a tag.
  var startTag = /^<([-A-Za-z0-9_]+)((?:\s+[-:A-Za-z0-9_]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
    endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/,
    attr = /([-:A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

  // Empty elements - HTML 4.01 (plus "embed"); these never get pushed on the open-element stack.
  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

  // Block elements - HTML 4.01; opening one closes any inline elements that are still open.
  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

  // Inline elements - HTML 4.01
  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

  // Elements that you can, intentionally, leave open (and which close themselves
  // when a sibling of the same name is opened).
  // The element is "option"; the previous entry "options" never matched any tag.
  var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

  // Attributes that have their values filled in, e.g. disabled="disabled"
  var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

  // Special elements (can contain anything); their content is passed through as raw text.
  var special = makeMap("script,style");
  50.  
  /**
   * Walks an HTML string from left to right and reports what it finds through
   * the optional callbacks on `handler`:
   *
   *   handler.start( tagName, attrs, unary )  - an opening tag; `tagName` is lower-cased,
   *       `attrs` is an array of { name, value, escaped } and `unary` is truthy for
   *       empty / self-closed elements.
   *   handler.end( tagName )                  - an element is closed (explicitly or implicitly).
   *   handler.chars( text )                   - a run of text.
   *   handler.comment( text )                 - the inside of an <!-- --> comment.
   *
   * Throws the string "Parse Error: <remaining html>" when an iteration fails to
   * consume any input. A string (not an Error) is thrown on purpose so existing
   * callers that inspect the thrown value keep working.
   */
  var HTMLParser = this.HTMLParser = function( html, handler ) {
    var index, chars, match, text, stack = [], last = html;

    // Convenience accessor for the innermost open element.
    stack.last = function() {
      return this[ this.length - 1 ];
    };

    while ( html ) {
      chars = true;

      // Make sure we're not in a script or style element
      if ( !stack.last() || !special[ stack.last() ] ) {

        // Comment
        if ( html.indexOf("<!--") === 0 ) {
          index = html.indexOf("-->");
          if ( index >= 0 ) {
            if ( handler.comment ) {
              handler.comment( html.substring( 4, index ) );
            }
            html = html.substring( index + 3 );
            chars = false;
          }

        // End tag
        } else if ( html.indexOf("</") === 0 ) {
          match = html.match( endTag );
          if ( match ) {
            html = html.substring( match[0].length );
            parseEndTag( match[0], match[1] );
            chars = false;
          }

        // Start tag
        } else if ( html.indexOf("<") === 0 ) {
          match = html.match( startTag );
          if ( match ) {
            html = html.substring( match[0].length );
            parseStartTag( match[0], match[1], match[2], match[3] );
            chars = false;
          }
        }

        // Nothing tag-like was consumed: emit everything up to the next "<" as text.
        if ( chars ) {
          index = html.indexOf("<");
          text = index < 0 ? html : html.substring( 0, index );
          html = index < 0 ? "" : html.substring( index );
          if ( handler.chars ) {
            handler.chars( text );
          }
        }

      } else {
        // Inside <script>/<style>: everything up to the FIRST matching close tag is raw
        // content. [\s\S] is used instead of "." so content spanning several lines is
        // captured in one piece, the match is lazy so two raw-text elements in a row are
        // not merged, and the "i" flag lets </SCRIPT> close a stack entry that was
        // lower-cased when it was opened.
        html = html.replace(
          new RegExp( "^([\\s\\S]*?)<\/" + stack.last() + "[^>]*>", "i" ),
          function( all, content ) {
            // Unwrap comment / CDATA guards that are commonly put around inline code.
            content = content.replace( /<!--([\s\S]*?)-->/g, "$1" )
              .replace( /<!\[CDATA\[([\s\S]*?)]]>/g, "$1" );

            if ( handler.chars ) {
              handler.chars( content );
            }

            return "";
          }
        );

        parseEndTag( "", stack.last() );
      }

      // No input was consumed during this pass, so the markup cannot be parsed.
      if ( html == last ) {
        throw "Parse Error: " + html;
      }
      last = html;
    }

    // Clean up any remaining tags
    parseEndTag();

    // Handles one opening tag: applies the implicit-close rules, records the element
    // on the stack (unless it is unary) and reports it through handler.start.
    function parseStartTag( tag, tagName, rest, unary ) {
      tagName = tagName.toLowerCase();

      // A block element implicitly closes every inline element left open.
      if ( block[ tagName ] ) {
        while ( stack.last() && inline[ stack.last() ] ) {
          parseEndTag( "", stack.last() );
        }
      }

      // e.g. a new <li> closes the <li> that is currently open.
      if ( closeSelf[ tagName ] && stack.last() === tagName ) {
        parseEndTag( "", tagName );
      }

      unary = empty[ tagName ] || !!unary;

      if ( !unary ) {
        stack.push( tagName );
      }

      if ( handler.start ) {
        var attrs = [];

        rest.replace( attr, function( all, name, doubleQuoted, singleQuoted, unquoted ) {
          // First non-empty capture wins; valueless boolean attributes are filled
          // in with their own name (disabled -> disabled="disabled").
          var value = doubleQuoted ? doubleQuoted :
            singleQuoted ? singleQuoted :
            unquoted ? unquoted :
            fillAttrs[ name ] ? name : "";

          attrs.push({
            name: name,
            value: value,
            // Backslash-escape double quotes that are not already escaped.
            escaped: value.replace( /(^|[^\\])"/g, '$1\\\"' )
          });
        });

        handler.start( tagName, attrs, unary );
      }
    }

    // Closes elements. With a tag name, closes the nearest open element of that name
    // and everything nested inside it; an end tag with no matching open element is
    // ignored. Without a tag name, closes every element that is still open.
    function parseEndTag( tag, tagName ) {
      var pos, i;

      if ( !tagName ) {
        // If no tag name is provided, clean shop
        pos = 0;
      } else {
        // Find the closest opened tag of the same type
        tagName = tagName.toLowerCase();
        for ( pos = stack.length - 1; pos >= 0; pos-- ) {
          if ( stack[ pos ] === tagName ) {
            break;
          }
        }
      }

      if ( pos >= 0 ) {
        // Close all the open elements, up the stack
        for ( i = stack.length - 1; i >= pos; i-- ) {
          if ( handler.end ) {
            handler.end( stack[ i ] );
          }
        }

        // Remove the open elements from the stack
        stack.length = pos;
      }
    }
  };
  /**
   * Re-serializes an HTML string as well-formed markup: every element reported by
   * HTMLParser is written back with quoted attributes, unary elements are written
   * as <tag/>, and every other element gets an explicit closing tag.
   * Returns the resulting string.
   */
  this.HTMLtoXML = function( html ) {
    // Output fragments, in document order; concatenated once at the end.
    var pieces = [];

    HTMLParser( html, {
      start: function( tag, attrs, unary ) {
        var open = "<" + tag;
        var n = 0;

        while ( n < attrs.length ) {
          // `escaped` is the attribute value as prepared by the parser for use
          // inside double quotes.
          open += " " + attrs[ n ].name + '="' + attrs[ n ].escaped + '"';
          n++;
        }

        if ( unary ) {
          open += "/";
        }

        pieces.push( open + ">" );
      },

      end: function( tag ) {
        pieces.push( "</" + tag + ">" );
      },

      chars: function( text ) {
        // Text is passed through untouched.
        pieces.push( "" + text );
      },

      comment: function( text ) {
        pieces.push( "<!--" + text + "-->" );
      }
    });

    return pieces.join("");
  };
  /**
   * Parses an HTML string and builds the corresponding nodes inside a DOM document.
   *
   * html - the markup to parse.
   * doc  - optional. A document, or a node whose owning document should be used.
   *        When omitted a new, empty document is created with whatever facility
   *        the environment offers.
   *
   * Parsed content is inserted into the document's <body>; <link> and <base>
   * elements are placed in <head> when one exists. The html/head/body/title
   * elements already present in the document are reused rather than recreated.
   * Comments are not copied into the document.
   *
   * Returns the document that was populated.
   */
  this.HTMLtoDOM = function( html, doc ) {
    // There can be only one of these elements
    var one = makeMap("html,head,body,title");

    // Enforce a structure for the document
    var structure = {
      link: "head",
      base: "head"
    };

    if ( !doc ) {
      if ( typeof DOMDocument != "undefined" ) {
        doc = new DOMDocument();
      } else if ( typeof document != "undefined" && document.implementation && document.implementation.createDocument ) {
        doc = document.implementation.createDocument("", "", null);
      } else if ( typeof ActiveXObject != "undefined" ) {
        // The guard must test the constructor that is actually used below.
        doc = new ActiveXObject("Msxml.DOMDocument");
      }
    } else {
      doc = doc.ownerDocument ||
        doc.getOwnerDocument && doc.getOwnerDocument() ||
        doc;
    }

    var elems = [],
      documentElement = doc.documentElement ||
        doc.getDocumentElement && doc.getDocumentElement();

    // If we're dealing with an empty document then we
    // need to pre-populate it with the HTML document structure
    if ( !documentElement && doc.createElement ) (function(){
      var html = doc.createElement("html");
      var head = doc.createElement("head");
      head.appendChild( doc.createElement("title") );
      html.appendChild( head );
      html.appendChild( doc.createElement("body") );
      doc.appendChild( html );
    })();

    // Find all the unique elements. When the document cannot be queried the
    // entries keep the placeholder value `true` assigned by makeMap.
    if ( doc.getElementsByTagName ) {
      for ( var i in one ) {
        one[ i ] = doc.getElementsByTagName( i )[0];
      }
    }

    // If we're working with a document, inject contents into
    // the body element
    var rootNode = one.body;
    var curParentNode = rootNode;

    // True when `node` is something children can be appended to (rules out
    // undefined and the `true` placeholders left in `one`).
    function canAppendTo( node ) {
      return !!( node && node.appendChild );
    }

    HTMLParser( html, {
      start: function( tagName, attrs, unary ) {
        // If it's a pre-built element, then we can ignore
        // its construction
        if ( one[ tagName ] ) {
          curParentNode = one[ tagName ];
          if ( !unary ) {
            elems.push( curParentNode );
          }
          return;
        }

        var elem = doc.createElement( tagName );

        // attrs is an array, so walk it by index rather than with for...in.
        for ( var k = 0; k < attrs.length; k++ ) {
          elem.setAttribute( attrs[ k ].name, attrs[ k ].value );
        }

        // Elements with a mandatory location go there when that location exists;
        // everything else goes under the element currently being filled.
        var forcedParent = structure[ tagName ] ? one[ structure[ tagName ] ] : null;

        if ( canAppendTo( forcedParent ) ) {
          forcedParent.appendChild( elem );
        } else if ( canAppendTo( curParentNode ) ) {
          curParentNode.appendChild( elem );
        }

        if ( !unary ) {
          elems.push( elem );
          curParentNode = elem;
        }
      },

      end: function( tag ) {
        elems.pop();

        // Return to the enclosing element, or to the body once every element opened
        // so far has been closed, so that following siblings and text still have a parent.
        curParentNode = elems.length ? elems[ elems.length - 1 ] : rootNode;
      },

      chars: function( text ) {
        if ( canAppendTo( curParentNode ) ) {
          curParentNode.appendChild( doc.createTextNode( text ) );
        }
      },

      comment: function( text ) {
        // Comments are intentionally not added to the document.
      }
    });

    return doc;
  };
  273.  
  /**
   * Builds a lookup table from a comma-separated list of names.
   *
   * str - names separated by "," (no surrounding whitespace is trimmed).
   *
   * Returns an object on which every listed name is a key with the value `true`;
   * any name that was not listed reads as `undefined`.
   */
  function makeMap( str ) {
    // The table has no prototype, so names that happen to exist on
    // Object.prototype ("constructor", "toString", "__proto__", ...) are not
    // mistaken for members when callers test `map[ name ]`.
    var obj = Object.create( null ), items = str.split(",");

    for ( var i = 0; i < items.length; i++ ) {
      obj[ items[ i ] ] = true;
    }

    return obj;
  }
  280. })();