readability.js (88097B)
1 /*eslint-env es6:false*/ 2 /* 3 * Copyright (c) 2010 Arc90 Inc 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 * This code is heavily based on Arc90's readability.js (1.7.1) script 20 * available at: http://code.google.com/p/arc90labs-readability 21 */ 22 23 /** 24 * Public constructor. 25 * @param {HTMLDocument} doc The document to parse. 26 * @param {Object} options The options object. 27 */ 28 function Readability(doc, options) { 29 // In some older versions, people passed a URI as the first argument. 
Cope: 30 if (options && options.documentElement) { 31 doc = options; 32 options = arguments[2]; 33 } else if (!doc || !doc.documentElement) { 34 throw new Error( 35 "First argument to Readability constructor should be a document object.", 36 ); 37 } 38 options = options || {}; 39 40 this._doc = doc; 41 this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__; 42 this._articleTitle = null; 43 this._articleByline = null; 44 this._articleDir = null; 45 this._articleSiteName = null; 46 this._attempts = []; 47 48 // Configurable options 49 this._debug = !!options.debug; 50 this._maxElemsToParse = 51 options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE; 52 this._nbTopCandidates = 53 options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES; 54 this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD; 55 this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat( 56 options.classesToPreserve || [], 57 ); 58 this._keepClasses = !!options.keepClasses; 59 this._serializer = 60 options.serializer || 61 function (el) { 62 return el.innerHTML; 63 }; 64 this._disableJSONLD = !!options.disableJSONLD; 65 66 // Start with all flags set 67 this._flags = 68 this.FLAG_STRIP_UNLIKELYS | 69 this.FLAG_WEIGHT_CLASSES | 70 this.FLAG_CLEAN_CONDITIONALLY; 71 72 // Control whether log messages are sent to the console 73 if (this._debug) { 74 let logNode = function (node) { 75 if (node.nodeType == node.TEXT_NODE) { 76 return `${node.nodeName} ("${node.textContent}")`; 77 } 78 let attrPairs = Array.from(node.attributes || [], function (attr) { 79 return `${attr.name}="${attr.value}"`; 80 }).join(" "); 81 return `<${node.localName} ${attrPairs}>`; 82 }; 83 this.log = function () { 84 if (typeof dump !== "undefined") { 85 var msg = Array.prototype.map 86 .call(arguments, function (x) { 87 return x && x.nodeName ? 
logNode(x) : x; 88 }) 89 .join(" "); 90 dump("Reader: (Readability) " + msg + "\n"); 91 } else if (typeof console !== "undefined") { 92 let args = Array.from(arguments, (arg) => { 93 if (arg && arg.nodeType == this.ELEMENT_NODE) { 94 return logNode(arg); 95 } 96 return arg; 97 }); 98 args.unshift("Reader: (Readability)"); 99 console.log.apply(console, args); 100 } 101 }; 102 } else { 103 this.log = function () {}; 104 } 105 } 106 107 Readability.prototype = { 108 FLAG_STRIP_UNLIKELYS: 0x1, 109 FLAG_WEIGHT_CLASSES: 0x2, 110 FLAG_CLEAN_CONDITIONALLY: 0x4, 111 112 // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType 113 ELEMENT_NODE: 1, 114 TEXT_NODE: 3, 115 116 // Max number of nodes supported by this parser. Default: 0 (no limit) 117 DEFAULT_MAX_ELEMS_TO_PARSE: 0, 118 119 // The number of top candidates to consider when analysing how 120 // tight the competition is among candidates. 121 DEFAULT_N_TOP_CANDIDATES: 5, 122 123 // Element tags to score by default. 124 DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre" 125 .toUpperCase() 126 .split(","), 127 128 // The default number of chars an article must have in order to return a result 129 DEFAULT_CHAR_THRESHOLD: 500, 130 131 // All of the regular expressions in use within readability. 132 // Defined up here so we don't instantiate them repeatedly in loops. 133 REGEXPS: { 134 // NOTE: These two regular expressions are duplicated in 135 // Readability-readerable.js. Please keep both copies in sync. 
136 unlikelyCandidates: 137 /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i, 138 okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i, 139 140 positive: 141 /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i, 142 negative: 143 /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i, 144 extraneous: 145 /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i, 146 byline: /byline|author|dateline|writtenby|p-author/i, 147 replaceFonts: /<(\/?)font[^>]*>/gi, 148 normalize: /\s{2,}/g, 149 videos: 150 /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i, 151 shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i, 152 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, 153 prevLink: /(prev|earl|old|new|<|«)/i, 154 tokenize: /\W+/g, 155 whitespace: /^\s*$/, 156 hasContent: /\S$/, 157 hashUrl: /^#.+/, 158 srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g, 159 b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i, 160 // See: https://schema.org/Article 161 jsonLdArticleTypes: 162 /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/, 163 }, 164 165 UNLIKELY_ROLES: [ 166 "menu", 167 "menubar", 168 "complementary", 169 "navigation", 170 "alert", 171 "alertdialog", 172 "dialog", 173 ], 174 175 
DIV_TO_P_ELEMS: new Set([ 176 "BLOCKQUOTE", 177 "DL", 178 "DIV", 179 "IMG", 180 "OL", 181 "P", 182 "PRE", 183 "TABLE", 184 "UL", 185 ]), 186 187 ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"], 188 189 PRESENTATIONAL_ATTRIBUTES: [ 190 "align", 191 "background", 192 "bgcolor", 193 "border", 194 "cellpadding", 195 "cellspacing", 196 "frame", 197 "hspace", 198 "rules", 199 "style", 200 "valign", 201 "vspace", 202 ], 203 204 DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"], 205 206 // The commented out elements qualify as phrasing content but tend to be 207 // removed by readability when put into paragraphs, so we ignore them here. 208 PHRASING_ELEMS: [ 209 // "CANVAS", "IFRAME", "SVG", "VIDEO", 210 "ABBR", 211 "AUDIO", 212 "B", 213 "BDO", 214 "BR", 215 "BUTTON", 216 "CITE", 217 "CODE", 218 "DATA", 219 "DATALIST", 220 "DFN", 221 "EM", 222 "EMBED", 223 "I", 224 "IMG", 225 "INPUT", 226 "KBD", 227 "LABEL", 228 "MARK", 229 "MATH", 230 "METER", 231 "NOSCRIPT", 232 "OBJECT", 233 "OUTPUT", 234 "PROGRESS", 235 "Q", 236 "RUBY", 237 "SAMP", 238 "SCRIPT", 239 "SELECT", 240 "SMALL", 241 "SPAN", 242 "STRONG", 243 "SUB", 244 "SUP", 245 "TEXTAREA", 246 "TIME", 247 "VAR", 248 "WBR", 249 ], 250 251 // These are the classes that readability sets itself. 252 CLASSES_TO_PRESERVE: ["page"], 253 254 // These are the list of HTML entities that need to be escaped. 255 HTML_ESCAPE_MAP: { 256 lt: "<", 257 gt: ">", 258 amp: "&", 259 quot: '"', 260 apos: "'", 261 }, 262 263 /** 264 * Run any post-process modifications to article content as necessary. 265 * 266 * @param Element 267 * @return void 268 **/ 269 _postProcessContent: function (articleContent) { 270 // Readability cannot open relative uris so we convert them to absolute uris. 271 this._fixRelativeUris(articleContent); 272 273 this._simplifyNestedElements(articleContent); 274 275 if (!this._keepClasses) { 276 // Remove classes. 
277 this._cleanClasses(articleContent); 278 } 279 }, 280 281 /** 282 * Iterates over a NodeList, calls `filterFn` for each node and removes node 283 * if function returned `true`. 284 * 285 * If function is not passed, removes all the nodes in node list. 286 * 287 * @param NodeList nodeList The nodes to operate on 288 * @param Function filterFn the function to use as a filter 289 * @return void 290 */ 291 _removeNodes: function (nodeList, filterFn) { 292 // Avoid ever operating on live node lists. 293 if (this._docJSDOMParser && nodeList._isLiveNodeList) { 294 throw new Error("Do not pass live node lists to _removeNodes"); 295 } 296 for (var i = nodeList.length - 1; i >= 0; i--) { 297 var node = nodeList[i]; 298 var parentNode = node.parentNode; 299 if (parentNode) { 300 if (!filterFn || filterFn.call(this, node, i, nodeList)) { 301 parentNode.removeChild(node); 302 } 303 } 304 } 305 }, 306 307 /** 308 * Iterates over a NodeList, and calls _setNodeTag for each node. 309 * 310 * @param NodeList nodeList The nodes to operate on 311 * @param String newTagName the new tag name to use 312 * @return void 313 */ 314 _replaceNodeTags: function (nodeList, newTagName) { 315 // Avoid ever operating on live node lists. 316 if (this._docJSDOMParser && nodeList._isLiveNodeList) { 317 throw new Error("Do not pass live node lists to _replaceNodeTags"); 318 } 319 for (const node of nodeList) { 320 this._setNodeTag(node, newTagName); 321 } 322 }, 323 324 /** 325 * Iterate over a NodeList, which doesn't natively fully implement the Array 326 * interface. 327 * 328 * For convenience, the current object context is applied to the provided 329 * iterate function. 330 * 331 * @param NodeList nodeList The NodeList. 332 * @param Function fn The iterate function. 
333 * @return void 334 */ 335 _forEachNode: function (nodeList, fn) { 336 Array.prototype.forEach.call(nodeList, fn, this); 337 }, 338 339 /** 340 * Iterate over a NodeList, and return the first node that passes 341 * the supplied test function 342 * 343 * For convenience, the current object context is applied to the provided 344 * test function. 345 * 346 * @param NodeList nodeList The NodeList. 347 * @param Function fn The test function. 348 * @return void 349 */ 350 _findNode: function (nodeList, fn) { 351 return Array.prototype.find.call(nodeList, fn, this); 352 }, 353 354 /** 355 * Iterate over a NodeList, return true if any of the provided iterate 356 * function calls returns true, false otherwise. 357 * 358 * For convenience, the current object context is applied to the 359 * provided iterate function. 360 * 361 * @param NodeList nodeList The NodeList. 362 * @param Function fn The iterate function. 363 * @return Boolean 364 */ 365 _someNode: function (nodeList, fn) { 366 return Array.prototype.some.call(nodeList, fn, this); 367 }, 368 369 /** 370 * Iterate over a NodeList, return true if all of the provided iterate 371 * function calls return true, false otherwise. 372 * 373 * For convenience, the current object context is applied to the 374 * provided iterate function. 375 * 376 * @param NodeList nodeList The NodeList. 377 * @param Function fn The iterate function. 378 * @return Boolean 379 */ 380 _everyNode: function (nodeList, fn) { 381 return Array.prototype.every.call(nodeList, fn, this); 382 }, 383 384 /** 385 * Concat all nodelists passed as arguments. 
386 * 387 * @return ...NodeList 388 * @return Array 389 */ 390 _concatNodeLists: function () { 391 var slice = Array.prototype.slice; 392 var args = slice.call(arguments); 393 var nodeLists = args.map(function (list) { 394 return slice.call(list); 395 }); 396 return Array.prototype.concat.apply([], nodeLists); 397 }, 398 399 _getAllNodesWithTag: function (node, tagNames) { 400 if (node.querySelectorAll) { 401 return node.querySelectorAll(tagNames.join(",")); 402 } 403 return [].concat.apply( 404 [], 405 tagNames.map(function (tag) { 406 var collection = node.getElementsByTagName(tag); 407 return Array.isArray(collection) ? collection : Array.from(collection); 408 }), 409 ); 410 }, 411 412 /** 413 * Removes the class="" attribute from every element in the given 414 * subtree, except those that match CLASSES_TO_PRESERVE and 415 * the classesToPreserve array from the options object. 416 * 417 * @param Element 418 * @return void 419 */ 420 _cleanClasses: function (node) { 421 var classesToPreserve = this._classesToPreserve; 422 var className = (node.getAttribute("class") || "") 423 .split(/\s+/) 424 .filter(function (cls) { 425 return classesToPreserve.indexOf(cls) != -1; 426 }) 427 .join(" "); 428 429 if (className) { 430 node.setAttribute("class", className); 431 } else { 432 node.removeAttribute("class"); 433 } 434 435 for (node = node.firstElementChild; node; node = node.nextElementSibling) { 436 this._cleanClasses(node); 437 } 438 }, 439 440 /** 441 * Converts each <a> and <img> uri in the given element to an absolute URI, 442 * ignoring #ref URIs. 
443 * 444 * @param Element 445 * @return void 446 */ 447 _fixRelativeUris: function (articleContent) { 448 var baseURI = this._doc.baseURI; 449 var documentURI = this._doc.documentURI; 450 function toAbsoluteURI(uri) { 451 // Leave hash links alone if the base URI matches the document URI: 452 if (baseURI == documentURI && uri.charAt(0) == "#") { 453 return uri; 454 } 455 456 // Otherwise, resolve against base URI: 457 try { 458 return new URL(uri, baseURI).href; 459 } catch (ex) { 460 // Something went wrong, just return the original: 461 } 462 return uri; 463 } 464 465 var links = this._getAllNodesWithTag(articleContent, ["a"]); 466 this._forEachNode(links, function (link) { 467 var href = link.getAttribute("href"); 468 if (href) { 469 // Remove links with javascript: URIs, since 470 // they won't work after scripts have been removed from the page. 471 if (href.indexOf("javascript:") === 0) { 472 // if the link only contains simple text content, it can be converted to a text node 473 if ( 474 link.childNodes.length === 1 && 475 link.childNodes[0].nodeType === this.TEXT_NODE 476 ) { 477 var text = this._doc.createTextNode(link.textContent); 478 link.parentNode.replaceChild(text, link); 479 } else { 480 // if the link has multiple children, they should all be preserved 481 var container = this._doc.createElement("span"); 482 while (link.firstChild) { 483 container.appendChild(link.firstChild); 484 } 485 link.parentNode.replaceChild(container, link); 486 } 487 } else { 488 link.setAttribute("href", toAbsoluteURI(href)); 489 } 490 } 491 }); 492 493 var medias = this._getAllNodesWithTag(articleContent, [ 494 "img", 495 "picture", 496 "figure", 497 "video", 498 "audio", 499 "source", 500 ]); 501 502 this._forEachNode(medias, function (media) { 503 var src = media.getAttribute("src"); 504 var poster = media.getAttribute("poster"); 505 var srcset = media.getAttribute("srcset"); 506 507 if (src) { 508 media.setAttribute("src", toAbsoluteURI(src)); 509 } 510 511 if 
(poster) { 512 media.setAttribute("poster", toAbsoluteURI(poster)); 513 } 514 515 if (srcset) { 516 var newSrcset = srcset.replace( 517 this.REGEXPS.srcsetUrl, 518 function (_, p1, p2, p3) { 519 return toAbsoluteURI(p1) + (p2 || "") + p3; 520 }, 521 ); 522 523 media.setAttribute("srcset", newSrcset); 524 } 525 }); 526 }, 527 528 _simplifyNestedElements: function (articleContent) { 529 var node = articleContent; 530 531 while (node) { 532 if ( 533 node.parentNode && 534 ["DIV", "SECTION"].includes(node.tagName) && 535 !(node.id && node.id.startsWith("readability")) 536 ) { 537 if (this._isElementWithoutContent(node)) { 538 node = this._removeAndGetNext(node); 539 continue; 540 } else if ( 541 this._hasSingleTagInsideElement(node, "DIV") || 542 this._hasSingleTagInsideElement(node, "SECTION") 543 ) { 544 var child = node.children[0]; 545 for (var i = 0; i < node.attributes.length; i++) { 546 child.setAttribute( 547 node.attributes[i].name, 548 node.attributes[i].value, 549 ); 550 } 551 node.parentNode.replaceChild(child, node); 552 node = child; 553 continue; 554 } 555 } 556 557 node = this._getNextNode(node); 558 } 559 }, 560 561 /** 562 * Get the article title as an H1. 563 * 564 * @return string 565 **/ 566 _getArticleTitle: function () { 567 var doc = this._doc; 568 var curTitle = ""; 569 var origTitle = ""; 570 571 try { 572 curTitle = origTitle = doc.title.trim(); 573 574 // If they had an element with id "title" in their HTML 575 if (typeof curTitle !== "string") 576 curTitle = origTitle = this._getInnerText( 577 doc.getElementsByTagName("title")[0], 578 ); 579 } catch (e) { 580 /* ignore exceptions setting the title. 
*/ 581 } 582 583 var titleHadHierarchicalSeparators = false; 584 function wordCount(str) { 585 return str.split(/\s+/).length; 586 } 587 588 // If there's a separator in the title, first remove the final part 589 if (/ [\|\-\\\/>»] /.test(curTitle)) { 590 titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle); 591 curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1"); 592 593 // If the resulting title is too short (3 words or fewer), remove 594 // the first part instead: 595 if (wordCount(curTitle) < 3) 596 curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1"); 597 } else if (curTitle.indexOf(": ") !== -1) { 598 // Check if we have an heading containing this exact string, so we 599 // could assume it's the full title. 600 var headings = this._concatNodeLists( 601 doc.getElementsByTagName("h1"), 602 doc.getElementsByTagName("h2"), 603 ); 604 var trimmedTitle = curTitle.trim(); 605 var match = this._someNode(headings, function (heading) { 606 return heading.textContent.trim() === trimmedTitle; 607 }); 608 609 // If we don't, let's extract the title out of the original title string. 
610 if (!match) { 611 curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1); 612 613 // If the title is now too short, try the first colon instead: 614 if (wordCount(curTitle) < 3) { 615 curTitle = origTitle.substring(origTitle.indexOf(":") + 1); 616 // But if we have too many words before the colon there's something weird 617 // with the titles and the H tags so let's just use the original title instead 618 } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) { 619 curTitle = origTitle; 620 } 621 } 622 } else if (curTitle.length > 150 || curTitle.length < 15) { 623 var hOnes = doc.getElementsByTagName("h1"); 624 625 if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]); 626 } 627 628 curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " "); 629 // If we now have 4 words or fewer as our title, and either no 630 // 'hierarchical' separators (\, /, > or ») were found in the original 631 // title or we decreased the number of words by more than 1 word, use 632 // the original title. 633 var curTitleWordCount = wordCount(curTitle); 634 if ( 635 curTitleWordCount <= 4 && 636 (!titleHadHierarchicalSeparators || 637 curTitleWordCount != 638 wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1) 639 ) { 640 curTitle = origTitle; 641 } 642 643 return curTitle; 644 }, 645 646 /** 647 * Prepare the HTML document for readability to scrape it. 648 * This includes things like stripping javascript, CSS, and handling terrible markup. 649 * 650 * @return void 651 **/ 652 _prepDocument: function () { 653 var doc = this._doc; 654 655 // Remove all style tags in head 656 this._removeNodes(this._getAllNodesWithTag(doc, ["style"])); 657 658 if (doc.body) { 659 this._replaceBrs(doc.body); 660 } 661 662 this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN"); 663 }, 664 665 /** 666 * Finds the next node, starting from the given node, and ignoring 667 * whitespace in between. 
If the given node is an element, the same node is
 * returned.
 *
 * Only non-element nodes whose text is entirely whitespace are skipped, so
 * the result may also be a non-whitespace text node, or null when the
 * siblings run out.
 *
 * @param Node node The node to start from (may be null).
 * @return Node|null
 */
_nextNode: function (node) {
  var next = node;
  while (
    next &&
    next.nodeType != this.ELEMENT_NODE &&
    this.REGEXPS.whitespace.test(next.textContent)
  ) {
    next = next.nextSibling;
  }
  return next;
},

/**
 * Replaces 2 or more successive <br> elements with a single <p>.
 * Whitespace between <br> elements is ignored. For example:
 *   <div>foo<br>bar<br> <br><br>abc</div>
 * will become:
 *   <div>foo<br>bar<p>abc</p></div>
 *
 * @param Element elem The subtree to process.
 * @return void
 */
_replaceBrs: function (elem) {
  this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
    var next = br.nextSibling;

    // Whether 2 or more <br> elements have been found and replaced with a
    // <p> block.
    var replaced = false;

    // If we find a <br> chain, remove the <br>s until we hit another node
    // or non-whitespace. This leaves behind the first <br> in the chain
    // (which will be replaced with a <p> later).
    while ((next = this._nextNode(next)) && next.tagName == "BR") {
      replaced = true;
      var brSibling = next.nextSibling;
      next.parentNode.removeChild(next);
      next = brSibling;
    }

    // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
    // all sibling nodes as children of the <p> until we hit another <br>
    // chain.
    if (replaced) {
      var p = this._doc.createElement("p");
      br.parentNode.replaceChild(p, br);

      next = p.nextSibling;
      while (next) {
        // If we've hit another <br><br>, we're done adding children to this <p>.
        if (next.tagName == "BR") {
          var nextElem = this._nextNode(next.nextSibling);
          if (nextElem && nextElem.tagName == "BR") break;
        }

        // Non-phrasing content also ends the paragraph.
        if (!this._isPhrasingContent(next)) break;

        // Otherwise, make this node a child of the new <p>.
        var sibling = next.nextSibling;
        p.appendChild(next);
        next = sibling;
      }

      // Drop trailing whitespace nodes from the new paragraph.
      while (p.lastChild && this._isWhitespace(p.lastChild)) {
        p.removeChild(p.lastChild);
      }

      // The new <p> must not sit inside another <p>; retag the parent.
      if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
    }
  });
},

/**
 * Changes the tag of a node.
 *
 * With the JSDOMParser document the node's names are rewritten in place and
 * the same node is returned. Otherwise a new element is created, the
 * children, `readability` data and attributes are carried over, the old
 * node is replaced in its parent, and the new element is returned.
 *
 * @param Element node The node to retag.
 * @param String tag The new tag name.
 * @return Element The node now carrying the requested tag.
 */
_setNodeTag: function (node, tag) {
  this.log("_setNodeTag", node, tag);
  if (this._docJSDOMParser) {
    node.localName = tag.toLowerCase();
    node.tagName = tag.toUpperCase();
    return node;
  }

  var replacement = node.ownerDocument.createElement(tag);
  while (node.firstChild) {
    replacement.appendChild(node.firstChild);
  }
  node.parentNode.replaceChild(replacement, node);
  if (node.readability) replacement.readability = node.readability;

  for (var i = 0; i < node.attributes.length; i++) {
    try {
      replacement.setAttribute(
        node.attributes[i].name,
        node.attributes[i].value,
      );
    } catch (ex) {
      /* it's possible for setAttribute() to throw if the attribute name
       * isn't a valid XML Name. Such attributes can however be parsed from
       * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
       * so we can hit them here and then throw. We don't care about such
       * attributes so we ignore them.
       */
    }
  }
  return replacement;
},

/**
 * Prepare the article node for display. Clean out any inline styles,
 * iframes, forms, strip extraneous <p> tags, etc.
 *
 * @param Element articleContent
 * @return void
 **/
_prepArticle: function (articleContent) {
  this._cleanStyles(articleContent);

  // Check for data tables before we continue, to avoid removing items in
  // those tables, which will often be isolated even though they're
  // visually linked to other content-ful elements (text, images, etc.).
  this._markDataTables(articleContent);

  this._fixLazyImages(articleContent);

  // Clean out junk from the article content
  this._cleanConditionally(articleContent, "form");
  this._cleanConditionally(articleContent, "fieldset");
  this._clean(articleContent, "object");
  this._clean(articleContent, "embed");
  this._clean(articleContent, "footer");
  this._clean(articleContent, "link");
  this._clean(articleContent, "aside");

  // Clean out elements with little content that have "share" in their
  // id/class combinations. This is run on each child of the article content
  // (the final top candidates), which means the top candidates themselves
  // are not removed even if they have "share".

  var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;

  this._forEachNode(articleContent.children, function (topCandidate) {
    this._cleanMatchedNodes(topCandidate, function (node, matchString) {
      return (
        this.REGEXPS.shareElements.test(matchString) &&
        node.textContent.length < shareElementThreshold
      );
    });
  });

  this._clean(articleContent, "iframe");
  this._clean(articleContent, "input");
  this._clean(articleContent, "textarea");
  this._clean(articleContent, "select");
  this._clean(articleContent, "button");
  this._cleanHeaders(articleContent);

  // Do these last as the previous stuff may have removed junk
  // that will affect these
  this._cleanConditionally(articleContent, "table");
  this._cleanConditionally(articleContent, "ul");
  this._cleanConditionally(articleContent, "div");

  // Replace H1 with H2, as H1 should be only the title, which is displayed
  // separately.
  this._replaceNodeTags(
    this._getAllNodesWithTag(articleContent, ["h1"]),
    "h2",
  );

  // Remove extra paragraphs: those with no img/embed/object/iframe inside
  // and no inner text.
  this._removeNodes(
    this._getAllNodesWithTag(articleContent, ["p"]),
    function (paragraph) {
      var imgCount = paragraph.getElementsByTagName("img").length;
      var embedCount = paragraph.getElementsByTagName("embed").length;
      var objectCount = paragraph.getElementsByTagName("object").length;
      // At this point, nasty iframes have been removed; only embedded video
      // ones remain.
      var iframeCount = paragraph.getElementsByTagName("iframe").length;
      var totalCount = imgCount + embedCount + objectCount + iframeCount;

      return totalCount === 0 && !this._getInnerText(paragraph, false);
    },
  );

  // Remove a <br> when the next thing after it (ignoring whitespace) is a <p>.
  this._forEachNode(
    this._getAllNodesWithTag(articleContent, ["br"]),
    function (br) {
      var next = this._nextNode(br.nextSibling);
      if (next && next.tagName == "P") br.parentNode.removeChild(br);
    },
  );

  // Remove single-cell tables: replace the table with its only cell,
  // retagged as P when all of the cell's children are phrasing content and
  // as DIV otherwise.
  this._forEachNode(
    this._getAllNodesWithTag(articleContent, ["table"]),
    function (table) {
      var tbody = this._hasSingleTagInsideElement(table, "TBODY")
        ? table.firstElementChild
        : table;
      if (this._hasSingleTagInsideElement(tbody, "TR")) {
        var row = tbody.firstElementChild;
        if (this._hasSingleTagInsideElement(row, "TD")) {
          var cell = row.firstElementChild;
          cell = this._setNodeTag(
            cell,
            this._everyNode(cell.childNodes, this._isPhrasingContent)
              ? "P"
              : "DIV",
          );
          table.parentNode.replaceChild(cell, table);
        }
      }
    },
  );
},

/**
 * Initialize a node with the readability object. Also checks the
 * className/id for special names to add to its score.
881 * 882 * @param Element 883 * @return void 884 **/ 885 _initializeNode: function (node) { 886 node.readability = { contentScore: 0 }; 887 888 switch (node.tagName) { 889 case "DIV": 890 node.readability.contentScore += 5; 891 break; 892 893 case "PRE": 894 case "TD": 895 case "BLOCKQUOTE": 896 node.readability.contentScore += 3; 897 break; 898 899 case "ADDRESS": 900 case "OL": 901 case "UL": 902 case "DL": 903 case "DD": 904 case "DT": 905 case "LI": 906 case "FORM": 907 node.readability.contentScore -= 3; 908 break; 909 910 case "H1": 911 case "H2": 912 case "H3": 913 case "H4": 914 case "H5": 915 case "H6": 916 case "TH": 917 node.readability.contentScore -= 5; 918 break; 919 } 920 921 node.readability.contentScore += this._getClassWeight(node); 922 }, 923 924 _removeAndGetNext: function (node) { 925 var nextNode = this._getNextNode(node, true); 926 node.parentNode.removeChild(node); 927 return nextNode; 928 }, 929 930 /** 931 * Traverse the DOM from node to node, starting at the node passed in. 932 * Pass true for the second parameter to indicate this node itself 933 * (and its kids) are going away, and we want the next node over. 934 * 935 * Calling this in a loop will traverse the DOM depth-first. 936 */ 937 _getNextNode: function (node, ignoreSelfAndKids) { 938 // First check for kids if those aren't being ignored 939 if (!ignoreSelfAndKids && node.firstElementChild) { 940 return node.firstElementChild; 941 } 942 // Then for siblings... 943 if (node.nextElementSibling) { 944 return node.nextElementSibling; 945 } 946 // And finally, move up the parent chain *and* find a sibling 947 // (because this is depth-first traversal, we will have already 948 // seen the parent nodes themselves). 
949 do { 950 node = node.parentNode; 951 } while (node && !node.nextElementSibling); 952 return node && node.nextElementSibling; 953 }, 954 955 // compares second text to first one 956 // 1 = same text, 0 = completely different text 957 // works the way that it splits both texts into words and then finds words that are unique in second text 958 // the result is given by the lower length of unique parts 959 _textSimilarity: function (textA, textB) { 960 var tokensA = textA 961 .toLowerCase() 962 .split(this.REGEXPS.tokenize) 963 .filter(Boolean); 964 var tokensB = textB 965 .toLowerCase() 966 .split(this.REGEXPS.tokenize) 967 .filter(Boolean); 968 if (!tokensA.length || !tokensB.length) { 969 return 0; 970 } 971 var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token)); 972 var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length; 973 return 1 - distanceB; 974 }, 975 976 _checkByline: function (node, matchString) { 977 if (this._articleByline) { 978 return false; 979 } 980 981 if (node.getAttribute !== undefined) { 982 var rel = node.getAttribute("rel"); 983 var itemprop = node.getAttribute("itemprop"); 984 } 985 986 if ( 987 (rel === "author" || 988 (itemprop && itemprop.indexOf("author") !== -1) || 989 this.REGEXPS.byline.test(matchString)) && 990 this._isValidByline(node.textContent) 991 ) { 992 this._articleByline = node.textContent.trim(); 993 return true; 994 } 995 996 return false; 997 }, 998 999 _getNodeAncestors: function (node, maxDepth) { 1000 maxDepth = maxDepth || 0; 1001 var i = 0, 1002 ancestors = []; 1003 while (node.parentNode) { 1004 ancestors.push(node.parentNode); 1005 if (maxDepth && ++i === maxDepth) break; 1006 node = node.parentNode; 1007 } 1008 return ancestors; 1009 }, 1010 1011 /*** 1012 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is 1013 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. 
1014 * 1015 * @param page a document to run upon. Needs to be a full document, complete with body. 1016 * @return Element 1017 **/ 1018 _grabArticle: function (page) { 1019 this.log("**** grabArticle ****"); 1020 var doc = this._doc; 1021 var isPaging = page !== null; 1022 page = page ? page : this._doc.body; 1023 1024 // We can't grab an article if we don't have a page! 1025 if (!page) { 1026 this.log("No body found in document. Abort."); 1027 return null; 1028 } 1029 1030 var pageCacheHtml = page.innerHTML; 1031 1032 while (true) { 1033 this.log("Starting grabArticle loop"); 1034 var stripUnlikelyCandidates = this._flagIsActive( 1035 this.FLAG_STRIP_UNLIKELYS, 1036 ); 1037 1038 // First, node prepping. Trash nodes that look cruddy (like ones with the 1039 // class name "comment", etc), and turn divs into P tags where they have been 1040 // used inappropriately (as in, where they contain no other block level elements.) 1041 var elementsToScore = []; 1042 var node = this._doc.documentElement; 1043 1044 let shouldRemoveTitleHeader = true; 1045 1046 while (node) { 1047 if (node.tagName === "HTML") { 1048 this._articleLang = node.getAttribute("lang"); 1049 } 1050 1051 var matchString = node.className + " " + node.id; 1052 1053 if (!this._isProbablyVisible(node)) { 1054 this.log("Removing hidden node - " + matchString); 1055 node = this._removeAndGetNext(node); 1056 continue; 1057 } 1058 1059 // Check to see if this node is a byline, and remove it if it is. 
1060 if (this._checkByline(node, matchString)) { 1061 node = this._removeAndGetNext(node); 1062 continue; 1063 } 1064 1065 if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) { 1066 this.log( 1067 "Removing header: ", 1068 node.textContent.trim(), 1069 this._articleTitle.trim(), 1070 ); 1071 shouldRemoveTitleHeader = false; 1072 node = this._removeAndGetNext(node); 1073 continue; 1074 } 1075 1076 // Remove unlikely candidates 1077 if (stripUnlikelyCandidates) { 1078 if ( 1079 this.REGEXPS.unlikelyCandidates.test(matchString) && 1080 !this.REGEXPS.okMaybeItsACandidate.test(matchString) && 1081 !this._hasAncestorTag(node, "table") && 1082 !this._hasAncestorTag(node, "code") && 1083 node.tagName !== "BODY" && 1084 node.tagName !== "A" 1085 ) { 1086 this.log("Removing unlikely candidate - " + matchString); 1087 node = this._removeAndGetNext(node); 1088 continue; 1089 } 1090 1091 if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) { 1092 this.log( 1093 "Removing content with role " + 1094 node.getAttribute("role") + 1095 " - " + 1096 matchString, 1097 ); 1098 node = this._removeAndGetNext(node); 1099 continue; 1100 } 1101 } 1102 1103 // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe). 1104 if ( 1105 (node.tagName === "DIV" || 1106 node.tagName === "SECTION" || 1107 node.tagName === "HEADER" || 1108 node.tagName === "H1" || 1109 node.tagName === "H2" || 1110 node.tagName === "H3" || 1111 node.tagName === "H4" || 1112 node.tagName === "H5" || 1113 node.tagName === "H6") && 1114 this._isElementWithoutContent(node) 1115 ) { 1116 node = this._removeAndGetNext(node); 1117 continue; 1118 } 1119 1120 if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) { 1121 elementsToScore.push(node); 1122 } 1123 1124 // Turn all divs that don't have children block level elements into p's 1125 if (node.tagName === "DIV") { 1126 // Put phrasing content into paragraphs. 
1127 var p = null; 1128 var childNode = node.firstChild; 1129 while (childNode) { 1130 var nextSibling = childNode.nextSibling; 1131 if (this._isPhrasingContent(childNode)) { 1132 if (p !== null) { 1133 p.appendChild(childNode); 1134 } else if (!this._isWhitespace(childNode)) { 1135 p = doc.createElement("p"); 1136 node.replaceChild(p, childNode); 1137 p.appendChild(childNode); 1138 } 1139 } else if (p !== null) { 1140 while (p.lastChild && this._isWhitespace(p.lastChild)) { 1141 p.removeChild(p.lastChild); 1142 } 1143 p = null; 1144 } 1145 childNode = nextSibling; 1146 } 1147 1148 // Sites like http://mobile.slate.com encloses each paragraph with a DIV 1149 // element. DIVs with only a P element inside and no text content can be 1150 // safely converted into plain P elements to avoid confusing the scoring 1151 // algorithm with DIVs with are, in practice, paragraphs. 1152 if ( 1153 this._hasSingleTagInsideElement(node, "P") && 1154 this._getLinkDensity(node) < 0.25 1155 ) { 1156 var newNode = node.children[0]; 1157 node.parentNode.replaceChild(newNode, node); 1158 node = newNode; 1159 elementsToScore.push(node); 1160 } else if (!this._hasChildBlockElement(node)) { 1161 node = this._setNodeTag(node, "P"); 1162 elementsToScore.push(node); 1163 } 1164 } 1165 node = this._getNextNode(node); 1166 } 1167 1168 /** 1169 * Loop through all paragraphs, and assign a score to them based on how content-y they look. 1170 * Then add their score to their parent node. 1171 * 1172 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. 1173 **/ 1174 var candidates = []; 1175 this._forEachNode(elementsToScore, function (elementToScore) { 1176 if ( 1177 !elementToScore.parentNode || 1178 typeof elementToScore.parentNode.tagName === "undefined" 1179 ) 1180 return; 1181 1182 // If this paragraph is less than 25 characters, don't even count it. 
1183 var innerText = this._getInnerText(elementToScore); 1184 if (innerText.length < 25) return; 1185 1186 // Exclude nodes with no ancestor. 1187 var ancestors = this._getNodeAncestors(elementToScore, 5); 1188 if (ancestors.length === 0) return; 1189 1190 var contentScore = 0; 1191 1192 // Add a point for the paragraph itself as a base. 1193 contentScore += 1; 1194 1195 // Add points for any commas within this paragraph. 1196 contentScore += innerText.split(",").length; 1197 1198 // For every 100 characters in this paragraph, add another point. Up to 3 points. 1199 contentScore += Math.min(Math.floor(innerText.length / 100), 3); 1200 1201 // Initialize and score ancestors. 1202 this._forEachNode(ancestors, function (ancestor, level) { 1203 if ( 1204 !ancestor.tagName || 1205 !ancestor.parentNode || 1206 typeof ancestor.parentNode.tagName === "undefined" 1207 ) 1208 return; 1209 1210 if (typeof ancestor.readability === "undefined") { 1211 this._initializeNode(ancestor); 1212 candidates.push(ancestor); 1213 } 1214 1215 // Node score divider: 1216 // - parent: 1 (no division) 1217 // - grandparent: 2 1218 // - great grandparent+: ancestor level * 3 1219 if (level === 0) var scoreDivider = 1; 1220 else if (level === 1) scoreDivider = 2; 1221 else scoreDivider = level * 3; 1222 ancestor.readability.contentScore += contentScore / scoreDivider; 1223 }); 1224 }); 1225 1226 // After we've calculated scores, loop through all of the possible 1227 // candidate nodes we found and find the one with the highest score. 1228 var topCandidates = []; 1229 for (var c = 0, cl = candidates.length; c < cl; c += 1) { 1230 var candidate = candidates[c]; 1231 1232 // Scale the final candidates score based on link density. Good content 1233 // should have a relatively small link density (5% or less) and be mostly 1234 // unaffected by this operation. 
1235 var candidateScore = 1236 candidate.readability.contentScore * 1237 (1 - this._getLinkDensity(candidate)); 1238 candidate.readability.contentScore = candidateScore; 1239 1240 this.log("Candidate:", candidate, "with score " + candidateScore); 1241 1242 for (var t = 0; t < this._nbTopCandidates; t++) { 1243 var aTopCandidate = topCandidates[t]; 1244 1245 if ( 1246 !aTopCandidate || 1247 candidateScore > aTopCandidate.readability.contentScore 1248 ) { 1249 topCandidates.splice(t, 0, candidate); 1250 if (topCandidates.length > this._nbTopCandidates) 1251 topCandidates.pop(); 1252 break; 1253 } 1254 } 1255 } 1256 1257 var topCandidate = topCandidates[0] || null; 1258 var neededToCreateTopCandidate = false; 1259 var parentOfTopCandidate; 1260 1261 // If we still have no top candidate, just use the body as a last resort. 1262 // We also have to copy the body node so it is something we can modify. 1263 if (topCandidate === null || topCandidate.tagName === "BODY") { 1264 // Move all of the page's children into topCandidate 1265 topCandidate = doc.createElement("DIV"); 1266 neededToCreateTopCandidate = true; 1267 // Move everything (not just elements, also text nodes etc.) into the container 1268 // so we even include text directly in the body: 1269 while (page.firstChild) { 1270 this.log("Moving child out:", page.firstChild); 1271 topCandidate.appendChild(page.firstChild); 1272 } 1273 1274 page.appendChild(topCandidate); 1275 1276 this._initializeNode(topCandidate); 1277 } else if (topCandidate) { 1278 // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array 1279 // and whose scores are quite closed with current `topCandidate` node. 
1280 var alternativeCandidateAncestors = []; 1281 for (var i = 1; i < topCandidates.length; i++) { 1282 if ( 1283 topCandidates[i].readability.contentScore / 1284 topCandidate.readability.contentScore >= 1285 0.75 1286 ) { 1287 alternativeCandidateAncestors.push( 1288 this._getNodeAncestors(topCandidates[i]), 1289 ); 1290 } 1291 } 1292 var MINIMUM_TOPCANDIDATES = 3; 1293 if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) { 1294 parentOfTopCandidate = topCandidate.parentNode; 1295 while (parentOfTopCandidate.tagName !== "BODY") { 1296 var listsContainingThisAncestor = 0; 1297 for ( 1298 var ancestorIndex = 0; 1299 ancestorIndex < alternativeCandidateAncestors.length && 1300 listsContainingThisAncestor < MINIMUM_TOPCANDIDATES; 1301 ancestorIndex++ 1302 ) { 1303 listsContainingThisAncestor += Number( 1304 alternativeCandidateAncestors[ancestorIndex].includes( 1305 parentOfTopCandidate, 1306 ), 1307 ); 1308 } 1309 if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) { 1310 topCandidate = parentOfTopCandidate; 1311 break; 1312 } 1313 parentOfTopCandidate = parentOfTopCandidate.parentNode; 1314 } 1315 } 1316 if (!topCandidate.readability) { 1317 this._initializeNode(topCandidate); 1318 } 1319 1320 // Because of our bonus system, parents of candidates might have scores 1321 // themselves. They get half of the node. There won't be nodes with higher 1322 // scores than our topCandidate, but if we see the score going *up* in the first 1323 // few steps up the tree, that's a decent sign that there might be more content 1324 // lurking in other places that we want to unify in. The sibling stuff 1325 // below does some of that - but only if we've looked high enough up the DOM 1326 // tree. 1327 parentOfTopCandidate = topCandidate.parentNode; 1328 var lastScore = topCandidate.readability.contentScore; 1329 // The scores shouldn't get too low. 
1330 var scoreThreshold = lastScore / 3; 1331 while (parentOfTopCandidate.tagName !== "BODY") { 1332 if (!parentOfTopCandidate.readability) { 1333 parentOfTopCandidate = parentOfTopCandidate.parentNode; 1334 continue; 1335 } 1336 var parentScore = parentOfTopCandidate.readability.contentScore; 1337 if (parentScore < scoreThreshold) break; 1338 if (parentScore > lastScore) { 1339 // Alright! We found a better parent to use. 1340 topCandidate = parentOfTopCandidate; 1341 break; 1342 } 1343 lastScore = parentOfTopCandidate.readability.contentScore; 1344 parentOfTopCandidate = parentOfTopCandidate.parentNode; 1345 } 1346 1347 // If the top candidate is the only child, use parent instead. This will help sibling 1348 // joining logic when adjacent content is actually located in parent's sibling node. 1349 parentOfTopCandidate = topCandidate.parentNode; 1350 while ( 1351 parentOfTopCandidate.tagName != "BODY" && 1352 parentOfTopCandidate.children.length == 1 1353 ) { 1354 topCandidate = parentOfTopCandidate; 1355 parentOfTopCandidate = topCandidate.parentNode; 1356 } 1357 if (!topCandidate.readability) { 1358 this._initializeNode(topCandidate); 1359 } 1360 } 1361 1362 // Now that we have the top candidate, look through its siblings for content 1363 // that might also be related. Things like preambles, content split by ads 1364 // that we removed, etc. 1365 var articleContent = doc.createElement("DIV"); 1366 if (isPaging) articleContent.id = "readability-content"; 1367 1368 var siblingScoreThreshold = Math.max( 1369 10, 1370 topCandidate.readability.contentScore * 0.2, 1371 ); 1372 // Keep potential top candidate's parent node to try to get text direction of it later. 
1373 parentOfTopCandidate = topCandidate.parentNode; 1374 var siblings = parentOfTopCandidate.children; 1375 1376 for (var s = 0, sl = siblings.length; s < sl; s++) { 1377 var sibling = siblings[s]; 1378 var append = false; 1379 1380 this.log( 1381 "Looking at sibling node:", 1382 sibling, 1383 sibling.readability 1384 ? "with score " + sibling.readability.contentScore 1385 : "", 1386 ); 1387 this.log( 1388 "Sibling has score", 1389 sibling.readability ? sibling.readability.contentScore : "Unknown", 1390 ); 1391 1392 if (sibling === topCandidate) { 1393 append = true; 1394 } else { 1395 var contentBonus = 0; 1396 1397 // Give a bonus if sibling nodes and top candidates have the example same classname 1398 if ( 1399 sibling.className === topCandidate.className && 1400 topCandidate.className !== "" 1401 ) 1402 contentBonus += topCandidate.readability.contentScore * 0.2; 1403 1404 if ( 1405 sibling.readability && 1406 sibling.readability.contentScore + contentBonus >= 1407 siblingScoreThreshold 1408 ) { 1409 append = true; 1410 } else if (sibling.nodeName === "P") { 1411 var linkDensity = this._getLinkDensity(sibling); 1412 var nodeContent = this._getInnerText(sibling); 1413 var nodeLength = nodeContent.length; 1414 1415 if (nodeLength > 80 && linkDensity < 0.25) { 1416 append = true; 1417 } else if ( 1418 nodeLength < 80 && 1419 nodeLength > 0 && 1420 linkDensity === 0 && 1421 nodeContent.search(/\.( |$)/) !== -1 1422 ) { 1423 append = true; 1424 } 1425 } 1426 } 1427 1428 if (append) { 1429 this.log("Appending node:", sibling); 1430 1431 if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) { 1432 // We have a node that isn't a common block level element, like a form or td tag. 1433 // Turn it into a div so it doesn't get filtered out later by accident. 
1434 this.log("Altering sibling:", sibling, "to div."); 1435 1436 sibling = this._setNodeTag(sibling, "DIV"); 1437 } 1438 1439 articleContent.appendChild(sibling); 1440 // Fetch children again to make it compatible 1441 // with DOM parsers without live collection support. 1442 siblings = parentOfTopCandidate.children; 1443 // siblings is a reference to the children array, and 1444 // sibling is removed from the array when we call appendChild(). 1445 // As a result, we must revisit this index since the nodes 1446 // have been shifted. 1447 s -= 1; 1448 sl -= 1; 1449 } 1450 } 1451 1452 if (this._debug) 1453 this.log("Article content pre-prep: " + articleContent.innerHTML); 1454 // So we have all of the content that we need. Now we clean it up for presentation. 1455 this._prepArticle(articleContent); 1456 if (this._debug) 1457 this.log("Article content post-prep: " + articleContent.innerHTML); 1458 1459 if (neededToCreateTopCandidate) { 1460 // We already created a fake div thing, and there wouldn't have been any siblings left 1461 // for the previous loop, so there's no point trying to create a new div, and then 1462 // move all the children over. Just assign IDs and class names here. No need to append 1463 // because that already happened anyway. 1464 topCandidate.id = "readability-page-1"; 1465 topCandidate.className = "page"; 1466 } else { 1467 var div = doc.createElement("DIV"); 1468 div.id = "readability-page-1"; 1469 div.className = "page"; 1470 while (articleContent.firstChild) { 1471 div.appendChild(articleContent.firstChild); 1472 } 1473 articleContent.appendChild(div); 1474 } 1475 1476 if (this._debug) 1477 this.log("Article content after paging: " + articleContent.innerHTML); 1478 1479 var parseSuccessful = true; 1480 1481 // Now that we've gone through the full algorithm, check to see if 1482 // we got any meaningful content. If we didn't, we may need to re-run 1483 // grabArticle with different flags set. 
This gives us a higher likelihood of 1484 // finding the content, and the sieve approach gives us a higher likelihood of 1485 // finding the -right- content. 1486 var textLength = this._getInnerText(articleContent, true).length; 1487 if (textLength < this._charThreshold) { 1488 parseSuccessful = false; 1489 page.innerHTML = pageCacheHtml; 1490 1491 if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) { 1492 this._removeFlag(this.FLAG_STRIP_UNLIKELYS); 1493 this._attempts.push({ 1494 articleContent: articleContent, 1495 textLength: textLength, 1496 }); 1497 } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) { 1498 this._removeFlag(this.FLAG_WEIGHT_CLASSES); 1499 this._attempts.push({ 1500 articleContent: articleContent, 1501 textLength: textLength, 1502 }); 1503 } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) { 1504 this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY); 1505 this._attempts.push({ 1506 articleContent: articleContent, 1507 textLength: textLength, 1508 }); 1509 } else { 1510 this._attempts.push({ 1511 articleContent: articleContent, 1512 textLength: textLength, 1513 }); 1514 // No luck after removing flags, just return the longest text we found during the different loops 1515 this._attempts.sort(function (a, b) { 1516 return b.textLength - a.textLength; 1517 }); 1518 1519 // But first check if we actually have something 1520 if (!this._attempts[0].textLength) { 1521 return null; 1522 } 1523 1524 articleContent = this._attempts[0].articleContent; 1525 parseSuccessful = true; 1526 } 1527 } 1528 1529 if (parseSuccessful) { 1530 // Find out text direction from ancestors of final top candidate. 
1531 var ancestors = [parentOfTopCandidate, topCandidate].concat( 1532 this._getNodeAncestors(parentOfTopCandidate), 1533 ); 1534 this._someNode(ancestors, function (ancestor) { 1535 if (!ancestor.tagName) return false; 1536 var articleDir = ancestor.getAttribute("dir"); 1537 if (articleDir) { 1538 this._articleDir = articleDir; 1539 return true; 1540 } 1541 return false; 1542 }); 1543 return articleContent; 1544 } 1545 } 1546 }, 1547 1548 /** 1549 * Check whether the input string could be a byline. 1550 * This verifies that the input is a string, and that the length 1551 * is less than 100 chars. 1552 * 1553 * @param possibleByline {string} - a string to check whether its a byline. 1554 * @return Boolean - whether the input string is a byline. 1555 */ 1556 _isValidByline: function (byline) { 1557 if (typeof byline == "string" || byline instanceof String) { 1558 byline = byline.trim(); 1559 return byline.length > 0 && byline.length < 100; 1560 } 1561 return false; 1562 }, 1563 1564 /** 1565 * Converts some of the common HTML entities in string to their corresponding characters. 1566 * 1567 * @param str {string} - a string to unescape. 1568 * @return string without HTML entity. 1569 */ 1570 _unescapeHtmlEntities: function (str) { 1571 if (!str) { 1572 return str; 1573 } 1574 1575 var htmlEscapeMap = this.HTML_ESCAPE_MAP; 1576 return str 1577 .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) { 1578 return htmlEscapeMap[tag]; 1579 }) 1580 .replace( 1581 /&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, 1582 function (_, hex, numStr) { 1583 var num = parseInt(hex || numStr, hex ? 16 : 10); 1584 return String.fromCharCode(num); 1585 }, 1586 ); 1587 }, 1588 1589 /** 1590 * Try to extract metadata from JSON-LD object. 1591 * For now, only Schema.org objects of type Article or its subtypes are supported. 
1592 * @return Object with any metadata that could be extracted (possibly none) 1593 */ 1594 _getJSONLD: function (doc) { 1595 var scripts = this._getAllNodesWithTag(doc, ["script"]); 1596 1597 var metadata; 1598 1599 this._forEachNode(scripts, function (jsonLdElement) { 1600 if ( 1601 !metadata && 1602 jsonLdElement.getAttribute("type") === "application/ld+json" 1603 ) { 1604 try { 1605 // Strip CDATA markers if present 1606 var content = jsonLdElement.textContent.replace( 1607 /^\s*<!\[CDATA\[|\]\]>\s*$/g, 1608 "", 1609 ); 1610 var parsed = JSON.parse(content); 1611 if ( 1612 !parsed["@context"] || 1613 !parsed["@context"].match(/^https?\:\/\/schema\.org$/) 1614 ) { 1615 return; 1616 } 1617 1618 if (!parsed["@type"] && Array.isArray(parsed["@graph"])) { 1619 parsed = parsed["@graph"].find(function (it) { 1620 return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes); 1621 }); 1622 } 1623 1624 if ( 1625 !parsed || 1626 !parsed["@type"] || 1627 !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes) 1628 ) { 1629 return; 1630 } 1631 1632 metadata = {}; 1633 1634 if ( 1635 typeof parsed.name === "string" && 1636 typeof parsed.headline === "string" && 1637 parsed.name !== parsed.headline 1638 ) { 1639 // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz 1640 // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either 1641 // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default. 
1642 1643 var title = this._getArticleTitle(); 1644 var nameMatches = this._textSimilarity(parsed.name, title) > 0.75; 1645 var headlineMatches = 1646 this._textSimilarity(parsed.headline, title) > 0.75; 1647 1648 if (headlineMatches && !nameMatches) { 1649 metadata.title = parsed.headline; 1650 } else { 1651 metadata.title = parsed.name; 1652 } 1653 } else if (typeof parsed.name === "string") { 1654 metadata.title = parsed.name.trim(); 1655 } else if (typeof parsed.headline === "string") { 1656 metadata.title = parsed.headline.trim(); 1657 } 1658 if (parsed.author) { 1659 if (typeof parsed.author.name === "string") { 1660 metadata.byline = parsed.author.name.trim(); 1661 } else if ( 1662 Array.isArray(parsed.author) && 1663 parsed.author[0] && 1664 typeof parsed.author[0].name === "string" 1665 ) { 1666 metadata.byline = parsed.author 1667 .filter(function (author) { 1668 return author && typeof author.name === "string"; 1669 }) 1670 .map(function (author) { 1671 return author.name.trim(); 1672 }) 1673 .join(", "); 1674 } 1675 } 1676 if (typeof parsed.description === "string") { 1677 metadata.excerpt = parsed.description.trim(); 1678 } 1679 if (parsed.publisher && typeof parsed.publisher.name === "string") { 1680 metadata.siteName = parsed.publisher.name.trim(); 1681 } 1682 return; 1683 } catch (err) { 1684 this.log(err.message); 1685 } 1686 } 1687 }); 1688 return metadata ? metadata : {}; 1689 }, 1690 1691 /** 1692 * Attempts to get excerpt and byline metadata for the article. 1693 * 1694 * @param {Object} jsonld — object containing any metadata that 1695 * could be extracted from JSON-LD object. 
1696 * 1697 * @return Object with optional "excerpt" and "byline" properties 1698 */ 1699 _getArticleMetadata: function (jsonld) { 1700 var metadata = {}; 1701 var values = {}; 1702 var metaElements = this._doc.getElementsByTagName("meta"); 1703 1704 // property is a space-separated list of values 1705 var propertyPattern = 1706 /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi; 1707 1708 // name is a single value 1709 var namePattern = 1710 /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i; 1711 1712 // Find description tags. 1713 this._forEachNode(metaElements, function (element) { 1714 var elementName = element.getAttribute("name"); 1715 var elementProperty = element.getAttribute("property"); 1716 var content = element.getAttribute("content"); 1717 if (!content) { 1718 return; 1719 } 1720 var matches = null; 1721 var name = null; 1722 1723 if (elementProperty) { 1724 matches = elementProperty.match(propertyPattern); 1725 if (matches) { 1726 // Convert to lowercase, and remove any whitespace 1727 // so we can match below. 1728 name = matches[0].toLowerCase().replace(/\s/g, ""); 1729 // multiple authors 1730 values[name] = content.trim(); 1731 } 1732 } 1733 if (!matches && elementName && namePattern.test(elementName)) { 1734 name = elementName; 1735 if (content) { 1736 // Convert to lowercase, remove any whitespace, and convert dots 1737 // to colons so we can match below. 
1738 name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":"); 1739 values[name] = content.trim(); 1740 } 1741 } 1742 }); 1743 1744 // get title 1745 metadata.title = 1746 jsonld.title || 1747 values["dc:title"] || 1748 values["dcterm:title"] || 1749 values["og:title"] || 1750 values["weibo:article:title"] || 1751 values["weibo:webpage:title"] || 1752 values["title"] || 1753 values["twitter:title"]; 1754 1755 if (!metadata.title) { 1756 metadata.title = this._getArticleTitle(); 1757 } 1758 1759 // get author 1760 metadata.byline = 1761 jsonld.byline || 1762 values["dc:creator"] || 1763 values["dcterm:creator"] || 1764 values["author"]; 1765 1766 // get description 1767 metadata.excerpt = 1768 jsonld.excerpt || 1769 values["dc:description"] || 1770 values["dcterm:description"] || 1771 values["og:description"] || 1772 values["weibo:article:description"] || 1773 values["weibo:webpage:description"] || 1774 values["description"] || 1775 values["twitter:description"]; 1776 1777 // get site name 1778 metadata.siteName = jsonld.siteName || values["og:site_name"]; 1779 1780 // in many sites the meta value is escaped with HTML entities, 1781 // so here we need to unescape it 1782 metadata.title = this._unescapeHtmlEntities(metadata.title); 1783 metadata.byline = this._unescapeHtmlEntities(metadata.byline); 1784 metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt); 1785 metadata.siteName = this._unescapeHtmlEntities(metadata.siteName); 1786 1787 return metadata; 1788 }, 1789 1790 /** 1791 * Check if node is image, or if node contains exactly only one image 1792 * whether as a direct child or as its descendants. 
1793 * 1794 * @param Element 1795 **/ 1796 _isSingleImage: function (node) { 1797 if (node.tagName === "IMG") { 1798 return true; 1799 } 1800 1801 if (node.children.length !== 1 || node.textContent.trim() !== "") { 1802 return false; 1803 } 1804 1805 return this._isSingleImage(node.children[0]); 1806 }, 1807 1808 /** 1809 * Find all <noscript> that are located after <img> nodes, and which contain only one 1810 * <img> element. Replace the first image with the image from inside the <noscript> tag, 1811 * and remove the <noscript> tag. This improves the quality of the images we use on 1812 * some sites (e.g. Medium). 1813 * 1814 * @param Element 1815 **/ 1816 _unwrapNoscriptImages: function (doc) { 1817 // Find img without source or attributes that might contains image, and remove it. 1818 // This is done to prevent a placeholder img is replaced by img from noscript in next step. 1819 var imgs = Array.from(doc.getElementsByTagName("img")); 1820 this._forEachNode(imgs, function (img) { 1821 for (var i = 0; i < img.attributes.length; i++) { 1822 var attr = img.attributes[i]; 1823 switch (attr.name) { 1824 case "src": 1825 case "srcset": 1826 case "data-src": 1827 case "data-srcset": 1828 return; 1829 } 1830 1831 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 1832 return; 1833 } 1834 } 1835 1836 img.parentNode.removeChild(img); 1837 }); 1838 1839 // Next find noscript and try to extract its image 1840 var noscripts = Array.from(doc.getElementsByTagName("noscript")); 1841 this._forEachNode(noscripts, function (noscript) { 1842 // Parse content of noscript and make sure it only contains image 1843 var tmp = doc.createElement("div"); 1844 tmp.innerHTML = noscript.innerHTML; 1845 if (!this._isSingleImage(tmp)) { 1846 return; 1847 } 1848 1849 // If noscript has previous sibling and it only contains image, 1850 // replace it with noscript content. However we also keep old 1851 // attributes that might contains image. 
1852 var prevElement = noscript.previousElementSibling; 1853 if (prevElement && this._isSingleImage(prevElement)) { 1854 var prevImg = prevElement; 1855 if (prevImg.tagName !== "IMG") { 1856 prevImg = prevElement.getElementsByTagName("img")[0]; 1857 } 1858 1859 var newImg = tmp.getElementsByTagName("img")[0]; 1860 for (var i = 0; i < prevImg.attributes.length; i++) { 1861 var attr = prevImg.attributes[i]; 1862 if (attr.value === "") { 1863 continue; 1864 } 1865 1866 if ( 1867 attr.name === "src" || 1868 attr.name === "srcset" || 1869 /\.(jpg|jpeg|png|webp)/i.test(attr.value) 1870 ) { 1871 if (newImg.getAttribute(attr.name) === attr.value) { 1872 continue; 1873 } 1874 1875 var attrName = attr.name; 1876 if (newImg.hasAttribute(attrName)) { 1877 attrName = "data-old-" + attrName; 1878 } 1879 1880 newImg.setAttribute(attrName, attr.value); 1881 } 1882 } 1883 1884 noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement); 1885 } 1886 }); 1887 }, 1888 1889 /** 1890 * Removes script tags from the document. 1891 * 1892 * @param Element 1893 **/ 1894 _removeScripts: function (doc) { 1895 this._removeNodes( 1896 this._getAllNodesWithTag(doc, ["script"]), 1897 function (scriptNode) { 1898 scriptNode.nodeValue = ""; 1899 scriptNode.removeAttribute("src"); 1900 return true; 1901 }, 1902 ); 1903 this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"])); 1904 }, 1905 1906 /** 1907 * Check if this node has only whitespace and a single element with given tag 1908 * Returns false if the DIV node contains non-empty text nodes 1909 * or if it contains no element with given tag or more than 1 element. 
1910 * 1911 * @param Element 1912 * @param string tag of child element 1913 **/ 1914 _hasSingleTagInsideElement: function (element, tag) { 1915 // There should be exactly 1 element child with given tag 1916 if (element.children.length != 1 || element.children[0].tagName !== tag) { 1917 return false; 1918 } 1919 1920 // And there should be no text nodes with real content 1921 return !this._someNode(element.childNodes, function (node) { 1922 return ( 1923 node.nodeType === this.TEXT_NODE && 1924 this.REGEXPS.hasContent.test(node.textContent) 1925 ); 1926 }); 1927 }, 1928 1929 _isElementWithoutContent: function (node) { 1930 return ( 1931 node.nodeType === this.ELEMENT_NODE && 1932 node.textContent.trim().length == 0 && 1933 (node.children.length == 0 || 1934 node.children.length == 1935 node.getElementsByTagName("br").length + 1936 node.getElementsByTagName("hr").length) 1937 ); 1938 }, 1939 1940 /** 1941 * Determine whether element has any children block level elements. 1942 * 1943 * @param Element 1944 */ 1945 _hasChildBlockElement: function (element) { 1946 return this._someNode(element.childNodes, function (node) { 1947 return ( 1948 this.DIV_TO_P_ELEMS.has(node.tagName) || 1949 this._hasChildBlockElement(node) 1950 ); 1951 }); 1952 }, 1953 1954 /*** 1955 * Determine if a node qualifies as phrasing content. 
1956 * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content 1957 **/ 1958 _isPhrasingContent: function (node) { 1959 return ( 1960 node.nodeType === this.TEXT_NODE || 1961 this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 || 1962 ((node.tagName === "A" || 1963 node.tagName === "DEL" || 1964 node.tagName === "INS") && 1965 this._everyNode(node.childNodes, this._isPhrasingContent)) 1966 ); 1967 }, 1968 1969 _isWhitespace: function (node) { 1970 return ( 1971 (node.nodeType === this.TEXT_NODE && 1972 node.textContent.trim().length === 0) || 1973 (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR") 1974 ); 1975 }, 1976 1977 /** 1978 * Get the inner text of a node - cross browser compatibly. 1979 * This also strips out any excess whitespace to be found. 1980 * 1981 * @param Element 1982 * @param Boolean normalizeSpaces (default: true) 1983 * @return string 1984 **/ 1985 _getInnerText: function (e, normalizeSpaces) { 1986 normalizeSpaces = 1987 typeof normalizeSpaces === "undefined" ? true : normalizeSpaces; 1988 var textContent = e.textContent.trim(); 1989 1990 if (normalizeSpaces) { 1991 return textContent.replace(this.REGEXPS.normalize, " "); 1992 } 1993 return textContent; 1994 }, 1995 1996 /** 1997 * Get the number of times a string s appears in the node e. 1998 * 1999 * @param Element 2000 * @param string - what to split on. Default is "," 2001 * @return number (integer) 2002 **/ 2003 _getCharCount: function (e, s) { 2004 s = s || ","; 2005 return this._getInnerText(e).split(s).length - 1; 2006 }, 2007 2008 /** 2009 * Remove the style attribute on every e and under. 2010 * TODO: Test if getElementsByTagName(*) is faster. 
2011 * 2012 * @param Element 2013 * @return void 2014 **/ 2015 _cleanStyles: function (e) { 2016 if (!e || e.tagName.toLowerCase() === "svg") return; 2017 2018 // Remove `style` and deprecated presentational attributes 2019 for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) { 2020 e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]); 2021 } 2022 2023 if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) { 2024 e.removeAttribute("width"); 2025 e.removeAttribute("height"); 2026 } 2027 2028 var cur = e.firstElementChild; 2029 while (cur !== null) { 2030 this._cleanStyles(cur); 2031 cur = cur.nextElementSibling; 2032 } 2033 }, 2034 2035 /** 2036 * Get the density of links as a percentage of the content 2037 * This is the amount of text that is inside a link divided by the total text in the node. 2038 * 2039 * @param Element 2040 * @return number (float) 2041 **/ 2042 _getLinkDensity: function (element) { 2043 var textLength = this._getInnerText(element).length; 2044 if (textLength === 0) return 0; 2045 2046 var linkLength = 0; 2047 2048 // XXX implement _reduceNodeList? 2049 this._forEachNode(element.getElementsByTagName("a"), function (linkNode) { 2050 var href = linkNode.getAttribute("href"); 2051 var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1; 2052 linkLength += this._getInnerText(linkNode).length * coefficient; 2053 }); 2054 2055 return linkLength / textLength; 2056 }, 2057 2058 /** 2059 * Get an elements class/id weight. Uses regular expressions to tell if this 2060 * element looks good or bad. 
2061 * 2062 * @param Element 2063 * @return number (Integer) 2064 **/ 2065 _getClassWeight: function (e) { 2066 if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) return 0; 2067 2068 var weight = 0; 2069 2070 // Look for a special classname 2071 if (typeof e.className === "string" && e.className !== "") { 2072 if (this.REGEXPS.negative.test(e.className)) weight -= 25; 2073 2074 if (this.REGEXPS.positive.test(e.className)) weight += 25; 2075 } 2076 2077 // Look for a special ID 2078 if (typeof e.id === "string" && e.id !== "") { 2079 if (this.REGEXPS.negative.test(e.id)) weight -= 25; 2080 2081 if (this.REGEXPS.positive.test(e.id)) weight += 25; 2082 } 2083 2084 return weight; 2085 }, 2086 2087 /** 2088 * Clean a node of all elements of type "tag". 2089 * (Unless it's a youtube/vimeo video. People love movies.) 2090 * 2091 * @param Element 2092 * @param string tag to clean 2093 * @return void 2094 **/ 2095 _clean: function (e, tag) { 2096 var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; 2097 2098 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) { 2099 // Allow youtube and vimeo videos through as people usually want to see those. 2100 if (isEmbed) { 2101 // First, check the elements attributes to see if any of them contain youtube or vimeo 2102 for (var i = 0; i < element.attributes.length; i++) { 2103 if (this.REGEXPS.videos.test(element.attributes[i].value)) { 2104 return false; 2105 } 2106 } 2107 2108 // For embed with <object> tag, check inner HTML as well. 2109 if ( 2110 element.tagName === "object" && 2111 this.REGEXPS.videos.test(element.innerHTML) 2112 ) { 2113 return false; 2114 } 2115 } 2116 2117 return true; 2118 }); 2119 }, 2120 2121 /** 2122 * Check if a given node has one of its ancestor tag name matching the 2123 * provided one. 
2124 * @param HTMLElement node 2125 * @param String tagName 2126 * @param Number maxDepth 2127 * @param Function filterFn a filter to invoke to determine whether this node 'counts' 2128 * @return Boolean 2129 */ 2130 _hasAncestorTag: function (node, tagName, maxDepth, filterFn) { 2131 maxDepth = maxDepth || 3; 2132 tagName = tagName.toUpperCase(); 2133 var depth = 0; 2134 while (node.parentNode) { 2135 if (maxDepth > 0 && depth > maxDepth) return false; 2136 if ( 2137 node.parentNode.tagName === tagName && 2138 (!filterFn || filterFn(node.parentNode)) 2139 ) 2140 return true; 2141 node = node.parentNode; 2142 depth++; 2143 } 2144 return false; 2145 }, 2146 2147 /** 2148 * Return an object indicating how many rows and columns this table has. 2149 */ 2150 _getRowAndColumnCount: function (table) { 2151 var rows = 0; 2152 var columns = 0; 2153 var trs = table.getElementsByTagName("tr"); 2154 for (var i = 0; i < trs.length; i++) { 2155 var rowspan = trs[i].getAttribute("rowspan") || 0; 2156 if (rowspan) { 2157 rowspan = parseInt(rowspan, 10); 2158 } 2159 rows += rowspan || 1; 2160 2161 // Now look for column-related info 2162 var columnsInThisRow = 0; 2163 var cells = trs[i].getElementsByTagName("td"); 2164 for (var j = 0; j < cells.length; j++) { 2165 var colspan = cells[j].getAttribute("colspan") || 0; 2166 if (colspan) { 2167 colspan = parseInt(colspan, 10); 2168 } 2169 columnsInThisRow += colspan || 1; 2170 } 2171 columns = Math.max(columns, columnsInThisRow); 2172 } 2173 return { rows: rows, columns: columns }; 2174 }, 2175 2176 /** 2177 * Look for 'data' (as opposed to 'layout') tables, for which we use 2178 * similar checks as 2179 * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19 2180 */ 2181 _markDataTables: function (root) { 2182 var tables = root.getElementsByTagName("table"); 2183 for (var i = 0; i < tables.length; i++) { 2184 var table = tables[i]; 2185 var role = 
table.getAttribute("role"); 2186 if (role == "presentation") { 2187 table._readabilityDataTable = false; 2188 continue; 2189 } 2190 var datatable = table.getAttribute("datatable"); 2191 if (datatable == "0") { 2192 table._readabilityDataTable = false; 2193 continue; 2194 } 2195 var summary = table.getAttribute("summary"); 2196 if (summary) { 2197 table._readabilityDataTable = true; 2198 continue; 2199 } 2200 2201 var caption = table.getElementsByTagName("caption")[0]; 2202 if (caption && caption.childNodes.length > 0) { 2203 table._readabilityDataTable = true; 2204 continue; 2205 } 2206 2207 // If the table has a descendant with any of these tags, consider a data table: 2208 var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"]; 2209 var descendantExists = function (tag) { 2210 return !!table.getElementsByTagName(tag)[0]; 2211 }; 2212 if (dataTableDescendants.some(descendantExists)) { 2213 this.log("Data table because found data-y descendant"); 2214 table._readabilityDataTable = true; 2215 continue; 2216 } 2217 2218 // Nested tables indicate a layout table: 2219 if (table.getElementsByTagName("table")[0]) { 2220 table._readabilityDataTable = false; 2221 continue; 2222 } 2223 2224 var sizeInfo = this._getRowAndColumnCount(table); 2225 if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) { 2226 table._readabilityDataTable = true; 2227 continue; 2228 } 2229 // Now just go by size entirely: 2230 table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10; 2231 } 2232 }, 2233 2234 /* convert images and figures that have properties like data-src into images that can be loaded without JS */ 2235 _fixLazyImages: function (root) { 2236 this._forEachNode( 2237 this._getAllNodesWithTag(root, ["img", "picture", "figure"]), 2238 function (elem) { 2239 // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute. 2240 // So, here we check if the data uri is too short, just might as well remove it. 
2241 if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) { 2242 // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes. 2243 var parts = this.REGEXPS.b64DataUrl.exec(elem.src); 2244 if (parts[1] === "image/svg+xml") { 2245 return; 2246 } 2247 2248 // Make sure this element has other attributes which contains image. 2249 // If it doesn't, then this src is important and shouldn't be removed. 2250 var srcCouldBeRemoved = false; 2251 for (var i = 0; i < elem.attributes.length; i++) { 2252 var attr = elem.attributes[i]; 2253 if (attr.name === "src") { 2254 continue; 2255 } 2256 2257 if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) { 2258 srcCouldBeRemoved = true; 2259 break; 2260 } 2261 } 2262 2263 // Here we assume if image is less than 100 bytes (or 133B after encoded to base64) 2264 // it will be too small, therefore it might be placeholder image. 2265 if (srcCouldBeRemoved) { 2266 var b64starts = elem.src.search(/base64\s*/i) + 7; 2267 var b64length = elem.src.length - b64starts; 2268 if (b64length < 133) { 2269 elem.removeAttribute("src"); 2270 } 2271 } 2272 } 2273 2274 // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580 2275 if ( 2276 (elem.src || (elem.srcset && elem.srcset != "null")) && 2277 elem.className.toLowerCase().indexOf("lazy") === -1 2278 ) { 2279 return; 2280 } 2281 2282 for (var j = 0; j < elem.attributes.length; j++) { 2283 attr = elem.attributes[j]; 2284 if ( 2285 attr.name === "src" || 2286 attr.name === "srcset" || 2287 attr.name === "alt" 2288 ) { 2289 continue; 2290 } 2291 var copyTo = null; 2292 if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) { 2293 copyTo = "srcset"; 2294 } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) { 2295 copyTo = "src"; 2296 } 2297 if (copyTo) { 2298 //if this is an img or picture, set the attribute directly 2299 if (elem.tagName === "IMG" || elem.tagName === "PICTURE") { 2300 elem.setAttribute(copyTo, attr.value); 2301 } else if ( 2302 
elem.tagName === "FIGURE" && 2303 !this._getAllNodesWithTag(elem, ["img", "picture"]).length 2304 ) { 2305 //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure 2306 //see the nytimes-3 testcase for an example 2307 var img = this._doc.createElement("img"); 2308 img.setAttribute(copyTo, attr.value); 2309 elem.appendChild(img); 2310 } 2311 } 2312 } 2313 }, 2314 ); 2315 }, 2316 2317 _getTextDensity: function (e, tags) { 2318 var textLength = this._getInnerText(e, true).length; 2319 if (textLength === 0) { 2320 return 0; 2321 } 2322 var childrenLength = 0; 2323 var children = this._getAllNodesWithTag(e, tags); 2324 this._forEachNode( 2325 children, 2326 (child) => (childrenLength += this._getInnerText(child, true).length), 2327 ); 2328 return childrenLength / textLength; 2329 }, 2330 2331 /** 2332 * Clean an element of all tags of type "tag" if they look fishy. 2333 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. 2334 * 2335 * @return void 2336 **/ 2337 _cleanConditionally: function (e, tag) { 2338 if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) return; 2339 2340 // Gather counts for other typical elements embedded within. 2341 // Traverse backwards so we can remove nodes at the same time 2342 // without effecting the traversal. 2343 // 2344 // TODO: Consider taking into account original contentScore here. 2345 this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) { 2346 // First check if this node IS data table, in which case don't remove it. 
2347 var isDataTable = function (t) { 2348 return t._readabilityDataTable; 2349 }; 2350 2351 var isList = tag === "ul" || tag === "ol"; 2352 if (!isList) { 2353 var listLength = 0; 2354 var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]); 2355 this._forEachNode( 2356 listNodes, 2357 (list) => (listLength += this._getInnerText(list).length), 2358 ); 2359 isList = listLength / this._getInnerText(node).length > 0.9; 2360 } 2361 2362 if (tag === "table" && isDataTable(node)) { 2363 return false; 2364 } 2365 2366 // Next check if we're inside a data table, in which case don't remove it as well. 2367 if (this._hasAncestorTag(node, "table", -1, isDataTable)) { 2368 return false; 2369 } 2370 2371 if (this._hasAncestorTag(node, "code")) { 2372 return false; 2373 } 2374 2375 var weight = this._getClassWeight(node); 2376 2377 this.log("Cleaning Conditionally", node); 2378 2379 var contentScore = 0; 2380 2381 if (weight + contentScore < 0) { 2382 return true; 2383 } 2384 2385 if (this._getCharCount(node, ",") < 10) { 2386 // If there are not very many commas, and the number of 2387 // non-paragraph elements is more than paragraphs or other 2388 // ominous signs, remove the element. 2389 var p = node.getElementsByTagName("p").length; 2390 var img = node.getElementsByTagName("img").length; 2391 var li = node.getElementsByTagName("li").length - 100; 2392 var input = node.getElementsByTagName("input").length; 2393 var headingDensity = this._getTextDensity(node, [ 2394 "h1", 2395 "h2", 2396 "h3", 2397 "h4", 2398 "h5", 2399 "h6", 2400 ]); 2401 2402 var embedCount = 0; 2403 var embeds = this._getAllNodesWithTag(node, [ 2404 "object", 2405 "embed", 2406 "iframe", 2407 ]); 2408 2409 for (var i = 0; i < embeds.length; i++) { 2410 // If this embed has attribute that matches video regex, don't delete it. 
2411 for (var j = 0; j < embeds[i].attributes.length; j++) { 2412 if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) { 2413 return false; 2414 } 2415 } 2416 2417 // For embed with <object> tag, check inner HTML as well. 2418 if ( 2419 embeds[i].tagName === "object" && 2420 this.REGEXPS.videos.test(embeds[i].innerHTML) 2421 ) { 2422 return false; 2423 } 2424 2425 embedCount++; 2426 } 2427 2428 var linkDensity = this._getLinkDensity(node); 2429 var contentLength = this._getInnerText(node).length; 2430 2431 var haveToRemove = 2432 (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || 2433 (!isList && li > p) || 2434 input > Math.floor(p / 3) || 2435 (!isList && 2436 headingDensity < 0.9 && 2437 contentLength < 25 && 2438 (img === 0 || img > 2) && 2439 !this._hasAncestorTag(node, "figure")) || 2440 (!isList && weight < 25 && linkDensity > 0.2) || 2441 (weight >= 25 && linkDensity > 0.5) || 2442 (embedCount === 1 && contentLength < 75) || 2443 embedCount > 1; 2444 // Allow simple lists of images to remain in pages 2445 if (isList && haveToRemove) { 2446 for (var x = 0; x < node.children.length; x++) { 2447 let child = node.children[x]; 2448 // Don't filter in lists with li's that contain more than one child 2449 if (child.children.length > 1) { 2450 return haveToRemove; 2451 } 2452 } 2453 li_count = node.getElementsByTagName("li").length; 2454 // Only allow the list to remain if every li contains an image 2455 if (img == li_count) { 2456 return false; 2457 } 2458 } 2459 return haveToRemove; 2460 } 2461 return false; 2462 }); 2463 }, 2464 2465 /** 2466 * Clean out elements that match the specified conditions 2467 * 2468 * @param Element 2469 * @param Function determines whether a node should be removed 2470 * @return void 2471 **/ 2472 _cleanMatchedNodes: function (e, filter) { 2473 var endOfSearchMarkerNode = this._getNextNode(e, true); 2474 var next = this._getNextNode(e); 2475 while (next && next != endOfSearchMarkerNode) { 2476 if 
(filter.call(this, next, next.className + " " + next.id)) { 2477 next = this._removeAndGetNext(next); 2478 } else { 2479 next = this._getNextNode(next); 2480 } 2481 } 2482 }, 2483 2484 /** 2485 * Clean out spurious headers from an Element. 2486 * 2487 * @param Element 2488 * @return void 2489 **/ 2490 _cleanHeaders: function (e) { 2491 let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]); 2492 this._removeNodes(headingNodes, function (node) { 2493 let shouldRemove = this._getClassWeight(node) < 0; 2494 if (shouldRemove) { 2495 this.log("Removing header with low class weight:", node); 2496 } 2497 return shouldRemove; 2498 }); 2499 }, 2500 2501 /** 2502 * Check if this node is an H1 or H2 element whose content is mostly 2503 * the same as the article title. 2504 * 2505 * @param Element the node to check. 2506 * @return boolean indicating whether this is a title-like header. 2507 */ 2508 _headerDuplicatesTitle: function (node) { 2509 if (node.tagName != "H1" && node.tagName != "H2") { 2510 return false; 2511 } 2512 var heading = this._getInnerText(node, false); 2513 this.log("Evaluating similarity of header:", heading, this._articleTitle); 2514 return this._textSimilarity(this._articleTitle, heading) > 0.75; 2515 }, 2516 2517 _flagIsActive: function (flag) { 2518 return (this._flags & flag) > 0; 2519 }, 2520 2521 _removeFlag: function (flag) { 2522 this._flags = this._flags & ~flag; 2523 }, 2524 2525 _isProbablyVisible: function (node) { 2526 // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes. 
2527 return ( 2528 (!node.style || node.style.display != "none") && 2529 !node.hasAttribute("hidden") && 2530 //check for "fallback-image" so that wikimedia math images are displayed 2531 (!node.hasAttribute("aria-hidden") || 2532 node.getAttribute("aria-hidden") != "true" || 2533 (node.className && 2534 node.className.indexOf && 2535 node.className.indexOf("fallback-image") !== -1)) 2536 ); 2537 }, 2538 2539 /** 2540 * Runs readability. 2541 * 2542 * Workflow: 2543 * 1. Prep the document by removing script tags, css, etc. 2544 * 2. Build readability's DOM tree. 2545 * 3. Grab the article content from the current dom tree. 2546 * 4. Replace the current DOM tree with the new one. 2547 * 5. Read peacefully. 2548 * 2549 * @return void 2550 **/ 2551 parse: function () { 2552 // Avoid parsing too large documents, as per configuration option 2553 if (this._maxElemsToParse > 0) { 2554 var numTags = this._doc.getElementsByTagName("*").length; 2555 if (numTags > this._maxElemsToParse) { 2556 throw new Error( 2557 "Aborting parsing document; " + numTags + " elements found", 2558 ); 2559 } 2560 } 2561 2562 // Unwrap image from noscript 2563 this._unwrapNoscriptImages(this._doc); 2564 2565 // Extract JSON-LD metadata before removing scripts 2566 var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc); 2567 2568 // Remove script tags from the document. 2569 this._removeScripts(this._doc); 2570 2571 this._prepDocument(); 2572 2573 var metadata = this._getArticleMetadata(jsonLd); 2574 this._articleTitle = metadata.title; 2575 2576 var articleContent = this._grabArticle(); 2577 if (!articleContent) return null; 2578 2579 this.log("Grabbed: " + articleContent.innerHTML); 2580 2581 this._postProcessContent(articleContent); 2582 2583 // If we haven't found an excerpt in the article's metadata, use the article's 2584 // first paragraph as the excerpt. This is used for displaying a preview of 2585 // the article's content. 
2586 if (!metadata.excerpt) { 2587 var paragraphs = articleContent.getElementsByTagName("p"); 2588 if (paragraphs.length > 0) { 2589 metadata.excerpt = paragraphs[0].textContent.trim(); 2590 } 2591 } 2592 2593 var textContent = articleContent.textContent; 2594 return { 2595 title: this._articleTitle, 2596 byline: metadata.byline || this._articleByline, 2597 dir: this._articleDir, 2598 lang: this._articleLang, 2599 content: this._serializer(articleContent), 2600 textContent: textContent, 2601 length: textContent.length, 2602 excerpt: metadata.excerpt, 2603 siteName: metadata.siteName || this._articleSiteName, 2604 }; 2605 }, 2606 }; 2607 2608 if (typeof module === "object") { 2609 module.exports = Readability; 2610 } 2611 2612 /* Define a css stylesheet */ 2613 2614 var style_sheet_simple = ` 2615 <style type="text/css"> 2616 2617 body { 2618 padding: 40px 200px 40px 200px !important; 2619 font-size: 18px; 2620 font: 18px/1.5 Roboto; 2621 line-height: 1.6; 2622 background-color: #FEFEFE !important; 2623 color: #444 !important; 2624 max-width: 99% !important; 2625 } 2626 2627 #readOverlay { 2628 display: block; 2629 position: absolute; 2630 background-color: white; 2631 top: 0; 2632 left: 0; 2633 width: 100%; 2634 } 2635 2636 /* Block quotes */ 2637 2638 blockquote{ 2639 width:60%; 2640 margin: 5px auto; 2641 font-style:italic; 2642 color: #555555; 2643 padding: 1.2em 30px 1.2em 75px; 2644 border-left:8px solid #005386 ; 2645 line-height:1.3; 2646 position: relative; 2647 background: #F0F0F0; 2648 } 2649 2650 blockquote::before{ 2651 font-family: Arial; 2652 content: "\\201C"; 2653 color: #005386; 2654 font-size: 6em; 2655 position: absolute; 2656 left: 10px; 2657 top:-10px; 2658 2659 } 2660 2661 a[href^="#footnote-"] { 2662 text-decoration: none; 2663 } 2664 a[href^="#footnote-"]::before { 2665 content:" ["; 2666 } 2667 a[href^="#footnote-"]::after { 2668 content:"] "; 2669 } 2670 2671 </style>`; 2672 2673 /* 2674 body { 2675 max-width: 650px; 2676 margin: 40px 
auto; 2677 padding: 0 10px; 2678 font: 18px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 2679 color: #444 2680 } 2681 */ 2682 /* See also 2683 * <https://gist.github.com/aanand/399131> 2684 * and the one included in firefox @ <chrome://global/skin/aboutReader.css> 2685 */ 2686 var documentClone = document.cloneNode(true); 2687 var article = new Readability(documentClone).parse(); 2688 document.head.innerHTML = `<title>${article.title}</title>\n${style_sheet_simple}`; 2689 document.body.innerHTML = `<h1>${article.title}</h1>\n${article.content}`; 2690 2691 /* Hack for archive.is */ 2692 var styles = ` 2693 img { 2694 max-width: 80% !important; 2695 height: auto; 2696 display: block; 2697 margin-left: auto; 2698 margin-right: auto; 2699 } 2700 `; 2701 2702 if (document.domain == "archive.is") { 2703 styles += ` 2704 2705 li > span { 2706 display: none !important; 2707 } 2708 2709 /* Matt Levine's Money Stuff specific stuff: */ 2710 2711 p > span > em { 2712 display: none !important; 2713 } 2714 iframe { 2715 display: none; 2716 } 2717 #div[id^='stickypbModal'] { 2718 display: none; 2719 } 2720 `; 2721 } 2722 2723 var styleSheet = document.createElement("style"); 2724 styleSheet.innerText = styles; 2725 document.head.appendChild(styleSheet); 2726 console.log("Style changed");