rosenrot-browser

A hackable browser based on Webkitgtk
Log | Files | Refs | README

readability.js (88097B)


      1 /*eslint-env es6:false*/
      2 /*
      3  * Copyright (c) 2010 Arc90 Inc
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *     http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 /*
     19  * This code is heavily based on Arc90's readability.js (1.7.1) script
     20  * available at: http://code.google.com/p/arc90labs-readability
     21  */
     22 
     23 /**
     24  * Public constructor.
     25  * @param {HTMLDocument} doc     The document to parse.
     26  * @param {Object}       options The options object.
     27  */
     28 function Readability(doc, options) {
     29   // In some older versions, people passed a URI as the first argument. Cope:
     30   if (options && options.documentElement) {
     31     doc = options;
     32     options = arguments[2];
     33   } else if (!doc || !doc.documentElement) {
     34     throw new Error(
     35       "First argument to Readability constructor should be a document object.",
     36     );
     37   }
     38   options = options || {};
     39 
     40   this._doc = doc;
     41   this._docJSDOMParser = this._doc.firstChild.__JSDOMParser__;
     42   this._articleTitle = null;
     43   this._articleByline = null;
     44   this._articleDir = null;
     45   this._articleSiteName = null;
     46   this._attempts = [];
     47 
     48   // Configurable options
     49   this._debug = !!options.debug;
     50   this._maxElemsToParse =
     51     options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
     52   this._nbTopCandidates =
     53     options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
     54   this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
     55   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(
     56     options.classesToPreserve || [],
     57   );
     58   this._keepClasses = !!options.keepClasses;
     59   this._serializer =
     60     options.serializer ||
     61     function (el) {
     62       return el.innerHTML;
     63     };
     64   this._disableJSONLD = !!options.disableJSONLD;
     65 
     66   // Start with all flags set
     67   this._flags =
     68     this.FLAG_STRIP_UNLIKELYS |
     69     this.FLAG_WEIGHT_CLASSES |
     70     this.FLAG_CLEAN_CONDITIONALLY;
     71 
     72   // Control whether log messages are sent to the console
     73   if (this._debug) {
     74     let logNode = function (node) {
     75       if (node.nodeType == node.TEXT_NODE) {
     76         return `${node.nodeName} ("${node.textContent}")`;
     77       }
     78       let attrPairs = Array.from(node.attributes || [], function (attr) {
     79         return `${attr.name}="${attr.value}"`;
     80       }).join(" ");
     81       return `<${node.localName} ${attrPairs}>`;
     82     };
     83     this.log = function () {
     84       if (typeof dump !== "undefined") {
     85         var msg = Array.prototype.map
     86           .call(arguments, function (x) {
     87             return x && x.nodeName ? logNode(x) : x;
     88           })
     89           .join(" ");
     90         dump("Reader: (Readability) " + msg + "\n");
     91       } else if (typeof console !== "undefined") {
     92         let args = Array.from(arguments, (arg) => {
     93           if (arg && arg.nodeType == this.ELEMENT_NODE) {
     94             return logNode(arg);
     95           }
     96           return arg;
     97         });
     98         args.unshift("Reader: (Readability)");
     99         console.log.apply(console, args);
    100       }
    101     };
    102   } else {
    103     this.log = function () {};
    104   }
    105 }
    106 
    107 Readability.prototype = {
    108   FLAG_STRIP_UNLIKELYS: 0x1,
    109   FLAG_WEIGHT_CLASSES: 0x2,
    110   FLAG_CLEAN_CONDITIONALLY: 0x4,
    111 
    112   // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
    113   ELEMENT_NODE: 1,
    114   TEXT_NODE: 3,
    115 
    116   // Max number of nodes supported by this parser. Default: 0 (no limit)
    117   DEFAULT_MAX_ELEMS_TO_PARSE: 0,
    118 
    119   // The number of top candidates to consider when analysing how
    120   // tight the competition is among candidates.
    121   DEFAULT_N_TOP_CANDIDATES: 5,
    122 
    123   // Element tags to score by default.
    124   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre"
    125     .toUpperCase()
    126     .split(","),
    127 
    128   // The default number of chars an article must have in order to return a result
    129   DEFAULT_CHAR_THRESHOLD: 500,
    130 
    131   // All of the regular expressions in use within readability.
    132   // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
    // NOTE: These two regular expressions are duplicated in
    // Readability-readerable.js. Please keep both copies in sync.
    unlikelyCandidates:
      /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
    okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,

    positive:
      /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
    negative:
      /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
    extraneous:
      /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
    byline: /byline|author|dateline|writtenby|p-author/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    // Matches runs of two or more whitespace characters.
    normalize: /\s{2,}/g,
    videos:
      /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
    shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    tokenize: /\W+/g,
    // Matches strings that are empty or consist solely of whitespace.
    whitespace: /^\s*$/,
    // NOTE(review): this only matches when the LAST character is
    // non-whitespace, so text with trailing whitespace does not match.
    // Confirm against the call sites that this is intended.
    hasContent: /\S$/,
    hashUrl: /^#.+/,
    // Captures, per srcset candidate: (1) the URL, (2) an optional width or
    // density descriptor, (3) the trailing separator (comma or end of string).
    srcsetUrl: /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))/g,
    b64DataUrl: /^data:\s*([^\s;,]+)\s*;\s*base64\s*,/i,
    // See: https://schema.org/Article
    // NOTE(review): the alternation is not grouped, so `^` applies only to
    // the first alternative ("Article") and `$` only to the last
    // ("APIReference"); every alternative in between matches anywhere in the
    // string. If exact type names are intended this would need to be
    // /^(?:...)$/ -- verify against the call site before changing, since
    // unanchored matching may be relied on (e.g. for IRI-style types).
    jsonLdArticleTypes:
      /^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$/,
  },
    164 
    165   UNLIKELY_ROLES: [
    166     "menu",
    167     "menubar",
    168     "complementary",
    169     "navigation",
    170     "alert",
    171     "alertdialog",
    172     "dialog",
    173   ],
    174 
    175   DIV_TO_P_ELEMS: new Set([
    176     "BLOCKQUOTE",
    177     "DL",
    178     "DIV",
    179     "IMG",
    180     "OL",
    181     "P",
    182     "PRE",
    183     "TABLE",
    184     "UL",
    185   ]),
    186 
    187   ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
    188 
    189   PRESENTATIONAL_ATTRIBUTES: [
    190     "align",
    191     "background",
    192     "bgcolor",
    193     "border",
    194     "cellpadding",
    195     "cellspacing",
    196     "frame",
    197     "hspace",
    198     "rules",
    199     "style",
    200     "valign",
    201     "vspace",
    202   ],
    203 
    204   DEPRECATED_SIZE_ATTRIBUTE_ELEMS: ["TABLE", "TH", "TD", "HR", "PRE"],
    205 
    206   // The commented out elements qualify as phrasing content but tend to be
    207   // removed by readability when put into paragraphs, so we ignore them here.
    208   PHRASING_ELEMS: [
    209     // "CANVAS", "IFRAME", "SVG", "VIDEO",
    210     "ABBR",
    211     "AUDIO",
    212     "B",
    213     "BDO",
    214     "BR",
    215     "BUTTON",
    216     "CITE",
    217     "CODE",
    218     "DATA",
    219     "DATALIST",
    220     "DFN",
    221     "EM",
    222     "EMBED",
    223     "I",
    224     "IMG",
    225     "INPUT",
    226     "KBD",
    227     "LABEL",
    228     "MARK",
    229     "MATH",
    230     "METER",
    231     "NOSCRIPT",
    232     "OBJECT",
    233     "OUTPUT",
    234     "PROGRESS",
    235     "Q",
    236     "RUBY",
    237     "SAMP",
    238     "SCRIPT",
    239     "SELECT",
    240     "SMALL",
    241     "SPAN",
    242     "STRONG",
    243     "SUB",
    244     "SUP",
    245     "TEXTAREA",
    246     "TIME",
    247     "VAR",
    248     "WBR",
    249   ],
    250 
    251   // These are the classes that readability sets itself.
    252   CLASSES_TO_PRESERVE: ["page"],
    253 
    254   // These are the list of HTML entities that need to be escaped.
    255   HTML_ESCAPE_MAP: {
    256     lt: "<",
    257     gt: ">",
    258     amp: "&",
    259     quot: '"',
    260     apos: "'",
    261   },
    262 
    263   /**
    264    * Run any post-process modifications to article content as necessary.
    265    *
    266    * @param Element
    267    * @return void
    268    **/
    269   _postProcessContent: function (articleContent) {
    270     // Readability cannot open relative uris so we convert them to absolute uris.
    271     this._fixRelativeUris(articleContent);
    272 
    273     this._simplifyNestedElements(articleContent);
    274 
    275     if (!this._keepClasses) {
    276       // Remove classes.
    277       this._cleanClasses(articleContent);
    278     }
    279   },
    280 
    281   /**
    282    * Iterates over a NodeList, calls `filterFn` for each node and removes node
    283    * if function returned `true`.
    284    *
    285    * If function is not passed, removes all the nodes in node list.
    286    *
    287    * @param NodeList nodeList The nodes to operate on
    288    * @param Function filterFn the function to use as a filter
    289    * @return void
    290    */
    291   _removeNodes: function (nodeList, filterFn) {
    292     // Avoid ever operating on live node lists.
    293     if (this._docJSDOMParser && nodeList._isLiveNodeList) {
    294       throw new Error("Do not pass live node lists to _removeNodes");
    295     }
    296     for (var i = nodeList.length - 1; i >= 0; i--) {
    297       var node = nodeList[i];
    298       var parentNode = node.parentNode;
    299       if (parentNode) {
    300         if (!filterFn || filterFn.call(this, node, i, nodeList)) {
    301           parentNode.removeChild(node);
    302         }
    303       }
    304     }
    305   },
    306 
    307   /**
    308    * Iterates over a NodeList, and calls _setNodeTag for each node.
    309    *
    310    * @param NodeList nodeList The nodes to operate on
    311    * @param String newTagName the new tag name to use
    312    * @return void
    313    */
  _replaceNodeTags: function (nodeList, newTagName) {
    // Avoid ever operating on live node lists.
    // The check only fires for JSDOMParser documents, where lists carry the
    // `_isLiveNodeList` marker.
    if (this._docJSDOMParser && nodeList._isLiveNodeList) {
      throw new Error("Do not pass live node lists to _replaceNodeTags");
    }
    // The value returned by _setNodeTag is intentionally ignored here; callers
    // that need the renamed node should call _setNodeTag directly.
    for (const node of nodeList) {
      this._setNodeTag(node, newTagName);
    }
  },
    323 
    324   /**
    325    * Iterate over a NodeList, which doesn't natively fully implement the Array
    326    * interface.
    327    *
    328    * For convenience, the current object context is applied to the provided
    329    * iterate function.
    330    *
    331    * @param  NodeList nodeList The NodeList.
    332    * @param  Function fn       The iterate function.
    333    * @return void
    334    */
  _forEachNode: function (nodeList, fn) {
    // Borrow Array.prototype.forEach so array-like lists can be walked; `this`
    // is passed as thisArg so `fn` runs with this Readability instance as its
    // context.
    Array.prototype.forEach.call(nodeList, fn, this);
  },
    338 
    339   /**
    340    * Iterate over a NodeList, and return the first node that passes
    341    * the supplied test function
    342    *
    343    * For convenience, the current object context is applied to the provided
    344    * test function.
    345    *
    346    * @param  NodeList nodeList The NodeList.
    347    * @param  Function fn       The test function.
    348    * @return Node|undefined The first node accepted by fn, or undefined if none is.
    349    */
  _findNode: function (nodeList, fn) {
    // Returns the first entry for which `fn` is truthy, or undefined when no
    // entry passes. `fn` is invoked with this Readability instance as `this`.
    return Array.prototype.find.call(nodeList, fn, this);
  },
    353 
    354   /**
    355    * Iterate over a NodeList, return true if any of the provided iterate
    356    * function calls returns true, false otherwise.
    357    *
    358    * For convenience, the current object context is applied to the
    359    * provided iterate function.
    360    *
    361    * @param  NodeList nodeList The NodeList.
    362    * @param  Function fn       The iterate function.
    363    * @return Boolean
    364    */
  _someNode: function (nodeList, fn) {
    // Borrowed Array.prototype.some: stops at the first truthy result. `fn` is
    // invoked with this Readability instance as `this`.
    return Array.prototype.some.call(nodeList, fn, this);
  },
    368 
    369   /**
    370    * Iterate over a NodeList, return true if all of the provided iterate
    371    * function calls return true, false otherwise.
    372    *
    373    * For convenience, the current object context is applied to the
    374    * provided iterate function.
    375    *
    376    * @param  NodeList nodeList The NodeList.
    377    * @param  Function fn       The iterate function.
    378    * @return Boolean
    379    */
  _everyNode: function (nodeList, fn) {
    // Borrowed Array.prototype.every: stops at the first falsy result and
    // returns true for an empty list. `fn` is invoked with this Readability
    // instance as `this`.
    return Array.prototype.every.call(nodeList, fn, this);
  },
    383 
    384   /**
    385    * Concat all nodelists passed as arguments.
    386    *
    387    * @param  ...NodeList The node lists to concatenate, in argument order.
    388    * @return Array
    389    */
    390   _concatNodeLists: function () {
    391     var slice = Array.prototype.slice;
    392     var args = slice.call(arguments);
    393     var nodeLists = args.map(function (list) {
    394       return slice.call(list);
    395     });
    396     return Array.prototype.concat.apply([], nodeLists);
    397   },
    398 
    399   _getAllNodesWithTag: function (node, tagNames) {
    400     if (node.querySelectorAll) {
    401       return node.querySelectorAll(tagNames.join(","));
    402     }
    403     return [].concat.apply(
    404       [],
    405       tagNames.map(function (tag) {
    406         var collection = node.getElementsByTagName(tag);
    407         return Array.isArray(collection) ? collection : Array.from(collection);
    408       }),
    409     );
    410   },
    411 
    412   /**
    413    * Removes the class="" attribute from every element in the given
    414    * subtree, except those that match CLASSES_TO_PRESERVE and
    415    * the classesToPreserve array from the options object.
    416    *
    417    * @param Element
    418    * @return void
    419    */
    420   _cleanClasses: function (node) {
    421     var classesToPreserve = this._classesToPreserve;
    422     var className = (node.getAttribute("class") || "")
    423       .split(/\s+/)
    424       .filter(function (cls) {
    425         return classesToPreserve.indexOf(cls) != -1;
    426       })
    427       .join(" ");
    428 
    429     if (className) {
    430       node.setAttribute("class", className);
    431     } else {
    432       node.removeAttribute("class");
    433     }
    434 
    435     for (node = node.firstElementChild; node; node = node.nextElementSibling) {
    436       this._cleanClasses(node);
    437     }
    438   },
    439 
    440   /**
    441    * Converts each <a> and <img> uri in the given element to an absolute URI,
    442    * ignoring #ref URIs.
    443    *
    444    * @param Element
    445    * @return void
    446    */
    447   _fixRelativeUris: function (articleContent) {
    448     var baseURI = this._doc.baseURI;
    449     var documentURI = this._doc.documentURI;
    450     function toAbsoluteURI(uri) {
    451       // Leave hash links alone if the base URI matches the document URI:
    452       if (baseURI == documentURI && uri.charAt(0) == "#") {
    453         return uri;
    454       }
    455 
    456       // Otherwise, resolve against base URI:
    457       try {
    458         return new URL(uri, baseURI).href;
    459       } catch (ex) {
    460         // Something went wrong, just return the original:
    461       }
    462       return uri;
    463     }
    464 
    465     var links = this._getAllNodesWithTag(articleContent, ["a"]);
    466     this._forEachNode(links, function (link) {
    467       var href = link.getAttribute("href");
    468       if (href) {
    469         // Remove links with javascript: URIs, since
    470         // they won't work after scripts have been removed from the page.
    471         if (href.indexOf("javascript:") === 0) {
    472           // if the link only contains simple text content, it can be converted to a text node
    473           if (
    474             link.childNodes.length === 1 &&
    475             link.childNodes[0].nodeType === this.TEXT_NODE
    476           ) {
    477             var text = this._doc.createTextNode(link.textContent);
    478             link.parentNode.replaceChild(text, link);
    479           } else {
    480             // if the link has multiple children, they should all be preserved
    481             var container = this._doc.createElement("span");
    482             while (link.firstChild) {
    483               container.appendChild(link.firstChild);
    484             }
    485             link.parentNode.replaceChild(container, link);
    486           }
    487         } else {
    488           link.setAttribute("href", toAbsoluteURI(href));
    489         }
    490       }
    491     });
    492 
    493     var medias = this._getAllNodesWithTag(articleContent, [
    494       "img",
    495       "picture",
    496       "figure",
    497       "video",
    498       "audio",
    499       "source",
    500     ]);
    501 
    502     this._forEachNode(medias, function (media) {
    503       var src = media.getAttribute("src");
    504       var poster = media.getAttribute("poster");
    505       var srcset = media.getAttribute("srcset");
    506 
    507       if (src) {
    508         media.setAttribute("src", toAbsoluteURI(src));
    509       }
    510 
    511       if (poster) {
    512         media.setAttribute("poster", toAbsoluteURI(poster));
    513       }
    514 
    515       if (srcset) {
    516         var newSrcset = srcset.replace(
    517           this.REGEXPS.srcsetUrl,
    518           function (_, p1, p2, p3) {
    519             return toAbsoluteURI(p1) + (p2 || "") + p3;
    520           },
    521         );
    522 
    523         media.setAttribute("srcset", newSrcset);
    524       }
    525     });
    526   },
    527 
    528   _simplifyNestedElements: function (articleContent) {
    529     var node = articleContent;
    530 
    531     while (node) {
    532       if (
    533         node.parentNode &&
    534         ["DIV", "SECTION"].includes(node.tagName) &&
    535         !(node.id && node.id.startsWith("readability"))
    536       ) {
    537         if (this._isElementWithoutContent(node)) {
    538           node = this._removeAndGetNext(node);
    539           continue;
    540         } else if (
    541           this._hasSingleTagInsideElement(node, "DIV") ||
    542           this._hasSingleTagInsideElement(node, "SECTION")
    543         ) {
    544           var child = node.children[0];
    545           for (var i = 0; i < node.attributes.length; i++) {
    546             child.setAttribute(
    547               node.attributes[i].name,
    548               node.attributes[i].value,
    549             );
    550           }
    551           node.parentNode.replaceChild(child, node);
    552           node = child;
    553           continue;
    554         }
    555       }
    556 
    557       node = this._getNextNode(node);
    558     }
    559   },
    560 
    561   /**
    562    * Get the article title as an H1.
    563    *
    564    * @return string
    565    **/
  _getArticleTitle: function () {
    var doc = this._doc;
    var curTitle = "";
    var origTitle = "";

    try {
      curTitle = origTitle = doc.title.trim();

      // If they had an element with id "title" in their HTML
      if (typeof curTitle !== "string")
        curTitle = origTitle = this._getInnerText(
          doc.getElementsByTagName("title")[0],
        );
    } catch (e) {
      /* ignore exceptions setting the title. */
      // On failure both titles keep their initial value of "".
    }

    var titleHadHierarchicalSeparators = false;
    // Counts whitespace-separated chunks. Note that an empty string counts as
    // one word, because "".split(/\s+/) yields [""].
    function wordCount(str) {
      return str.split(/\s+/).length;
    }

    // If there's a separator in the title, first remove the final part
    if (/ [\|\-\\\/>»] /.test(curTitle)) {
      titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
      curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");

      // If the resulting title is too short (fewer than 3 words), remove
      // the first part instead:
      if (wordCount(curTitle) < 3)
        curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
    } else if (curTitle.indexOf(": ") !== -1) {
      // Check if we have an heading containing this exact string, so we
      // could assume it's the full title.
      var headings = this._concatNodeLists(
        doc.getElementsByTagName("h1"),
        doc.getElementsByTagName("h2"),
      );
      var trimmedTitle = curTitle.trim();
      var match = this._someNode(headings, function (heading) {
        return heading.textContent.trim() === trimmedTitle;
      });

      // If we don't, let's extract the title out of the original title string.
      if (!match) {
        // Start with everything after the LAST colon.
        curTitle = origTitle.substring(origTitle.lastIndexOf(":") + 1);

        // If the title is now too short (fewer than 3 words), try the first
        // colon instead:
        if (wordCount(curTitle) < 3) {
          curTitle = origTitle.substring(origTitle.indexOf(":") + 1);
          // But if we have too many words before the colon there's something weird
          // with the titles and the H tags so let's just use the original title instead
        } else if (wordCount(origTitle.substr(0, origTitle.indexOf(":"))) > 5) {
          curTitle = origTitle;
        }
      }
    } else if (curTitle.length > 150 || curTitle.length < 15) {
      // Suspiciously long or short title with no separator: use the page's
      // <h1> instead, but only when there is exactly one.
      var hOnes = doc.getElementsByTagName("h1");

      if (hOnes.length === 1) curTitle = this._getInnerText(hOnes[0]);
    }

    curTitle = curTitle.trim().replace(this.REGEXPS.normalize, " ");
    // If we now have 4 words or fewer as our title, and either no
    // 'hierarchical' separators (\, /, > or ») were found in the original
    // title or we decreased the number of words by more than 1 word, use
    // the original title.
    // NOTE(review): the code tests for "not exactly one word fewer than the
    // separator-stripped original" (!=), which is broader than "more than 1
    // word" as described above.
    var curTitleWordCount = wordCount(curTitle);
    if (
      curTitleWordCount <= 4 &&
      (!titleHadHierarchicalSeparators ||
        curTitleWordCount !=
          wordCount(origTitle.replace(/[\|\-\\\/>»]+/g, "")) - 1)
    ) {
      curTitle = origTitle;
    }

    return curTitle;
  },
    645 
    646   /**
    647    * Prepare the HTML document for readability to scrape it.
    648    * This includes things like stripping javascript, CSS, and handling terrible markup.
    649    *
    650    * @return void
    651    **/
  _prepDocument: function () {
    var doc = this._doc;

    // Remove every <style> element in the document (the lookup is rooted at
    // the document itself, so it is not limited to <head>).
    this._removeNodes(this._getAllNodesWithTag(doc, ["style"]));

    // Turn chains of <br> elements into paragraphs, when there is a body.
    if (doc.body) {
      this._replaceBrs(doc.body);
    }

    // <font> elements are converted to <span>.
    this._replaceNodeTags(this._getAllNodesWithTag(doc, ["font"]), "SPAN");
  },
    664 
    665   /**
    666    * Finds the next node, starting from the given node, and ignoring
    667    * whitespace in between. If the given node is an element, the same node is
    668    * returned.
    669    */
    670   _nextNode: function (node) {
    671     var next = node;
    672     while (
    673       next &&
    674       next.nodeType != this.ELEMENT_NODE &&
    675       this.REGEXPS.whitespace.test(next.textContent)
    676     ) {
    677       next = next.nextSibling;
    678     }
    679     return next;
    680   },
    681 
    682   /**
    683    * Replaces 2 or more successive <br> elements with a single <p>.
    684    * Whitespace between <br> elements are ignored. For example:
    685    *   <div>foo<br>bar<br> <br><br>abc</div>
    686    * will become:
    687    *   <div>foo<br>bar<p>abc</p></div>
    688    */
  _replaceBrs: function (elem) {
    this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function (br) {
      var next = br.nextSibling;

      // Whether 2 or more <br> elements have been found and replaced with a
      // <p> block.
      var replaced = false;

      // If we find a <br> chain, remove the <br>s until we hit another node
      // or non-whitespace. This leaves behind the first <br> in the chain
      // (which will be replaced with a <p> later).
      // _nextNode skips over whitespace-only non-element nodes, so <br>s
      // separated only by whitespace count as one chain.
      while ((next = this._nextNode(next)) && next.tagName == "BR") {
        replaced = true;
        // Remember the following sibling before detaching this <br>.
        var brSibling = next.nextSibling;
        next.parentNode.removeChild(next);
        next = brSibling;
      }

      // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
      // all sibling nodes as children of the <p> until we hit another <br>
      // chain.
      if (replaced) {
        var p = this._doc.createElement("p");
        br.parentNode.replaceChild(p, br);

        next = p.nextSibling;
        while (next) {
          // If we've hit another <br><br>, we're done adding children to this <p>.
          if (next.tagName == "BR") {
            var nextElem = this._nextNode(next.nextSibling);
            if (nextElem && nextElem.tagName == "BR") break;
          }

          // Only phrasing content is pulled into the paragraph; anything else
          // ends it.
          if (!this._isPhrasingContent(next)) break;

          // Otherwise, make this node a child of the new <p>.
          var sibling = next.nextSibling;
          p.appendChild(next);
          next = sibling;
        }

        // Trim trailing whitespace nodes from the new paragraph.
        while (p.lastChild && this._isWhitespace(p.lastChild)) {
          p.removeChild(p.lastChild);
        }

        // The new <p> must not sit inside another <p>, so a <p> parent is
        // converted to a <div>.
        if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
      }
    });
  },
    738 
    739   _setNodeTag: function (node, tag) {
    740     this.log("_setNodeTag", node, tag);
    741     if (this._docJSDOMParser) {
    742       node.localName = tag.toLowerCase();
    743       node.tagName = tag.toUpperCase();
    744       return node;
    745     }
    746 
    747     var replacement = node.ownerDocument.createElement(tag);
    748     while (node.firstChild) {
    749       replacement.appendChild(node.firstChild);
    750     }
    751     node.parentNode.replaceChild(replacement, node);
    752     if (node.readability) replacement.readability = node.readability;
    753 
    754     for (var i = 0; i < node.attributes.length; i++) {
    755       try {
    756         replacement.setAttribute(
    757           node.attributes[i].name,
    758           node.attributes[i].value,
    759         );
    760       } catch (ex) {
    761         /* it's possible for setAttribute() to throw if the attribute name
    762          * isn't a valid XML Name. Such attributes can however be parsed from
    763          * source in HTML docs, see https://github.com/whatwg/html/issues/4275,
    764          * so we can hit them here and then throw. We don't care about such
    765          * attributes so we ignore them.
    766          */
    767       }
    768     }
    769     return replacement;
    770   },
    771 
    772   /**
    773    * Prepare the article node for display. Clean out any inline styles,
    774    * iframes, forms, strip extraneous <p> tags, etc.
    775    *
    776    * @param Element
    777    * @return void
    778    **/
    779   _prepArticle: function (articleContent) {
    780     this._cleanStyles(articleContent);
    781 
    782     // Check for data tables before we continue, to avoid removing items in
    783     // those tables, which will often be isolated even though they're
    784     // visually linked to other content-ful elements (text, images, etc.).
    785     this._markDataTables(articleContent);
    786 
    787     this._fixLazyImages(articleContent);
    788 
    789     // Clean out junk from the article content
    790     this._cleanConditionally(articleContent, "form");
    791     this._cleanConditionally(articleContent, "fieldset");
    792     this._clean(articleContent, "object");
    793     this._clean(articleContent, "embed");
    794     this._clean(articleContent, "footer");
    795     this._clean(articleContent, "link");
    796     this._clean(articleContent, "aside");
    797 
    798     // Clean out elements with little content that have "share" in their id/class combinations from final top candidates,
    799     // which means we don't remove the top candidates even they have "share".
    800 
    801     var shareElementThreshold = this.DEFAULT_CHAR_THRESHOLD;
    802 
    803     this._forEachNode(articleContent.children, function (topCandidate) {
    804       this._cleanMatchedNodes(topCandidate, function (node, matchString) {
    805         return (
    806           this.REGEXPS.shareElements.test(matchString) &&
    807           node.textContent.length < shareElementThreshold
    808         );
    809       });
    810     });
    811 
    812     this._clean(articleContent, "iframe");
    813     this._clean(articleContent, "input");
    814     this._clean(articleContent, "textarea");
    815     this._clean(articleContent, "select");
    816     this._clean(articleContent, "button");
    817     this._cleanHeaders(articleContent);
    818 
    819     // Do these last as the previous stuff may have removed junk
    820     // that will affect these
    821     this._cleanConditionally(articleContent, "table");
    822     this._cleanConditionally(articleContent, "ul");
    823     this._cleanConditionally(articleContent, "div");
    824 
    825     // replace H1 with H2 as H1 should be only title that is displayed separately
    826     this._replaceNodeTags(
    827       this._getAllNodesWithTag(articleContent, ["h1"]),
    828       "h2",
    829     );
    830 
    831     // Remove extra paragraphs
    832     this._removeNodes(
    833       this._getAllNodesWithTag(articleContent, ["p"]),
    834       function (paragraph) {
    835         var imgCount = paragraph.getElementsByTagName("img").length;
    836         var embedCount = paragraph.getElementsByTagName("embed").length;
    837         var objectCount = paragraph.getElementsByTagName("object").length;
    838         // At this point, nasty iframes have been removed, only remain embedded video ones.
    839         var iframeCount = paragraph.getElementsByTagName("iframe").length;
    840         var totalCount = imgCount + embedCount + objectCount + iframeCount;
    841 
    842         return totalCount === 0 && !this._getInnerText(paragraph, false);
    843       },
    844     );
    845 
    846     this._forEachNode(
    847       this._getAllNodesWithTag(articleContent, ["br"]),
    848       function (br) {
    849         var next = this._nextNode(br.nextSibling);
    850         if (next && next.tagName == "P") br.parentNode.removeChild(br);
    851       },
    852     );
    853 
    854     // Remove single-cell tables
    855     this._forEachNode(
    856       this._getAllNodesWithTag(articleContent, ["table"]),
    857       function (table) {
    858         var tbody = this._hasSingleTagInsideElement(table, "TBODY")
    859           ? table.firstElementChild
    860           : table;
    861         if (this._hasSingleTagInsideElement(tbody, "TR")) {
    862           var row = tbody.firstElementChild;
    863           if (this._hasSingleTagInsideElement(row, "TD")) {
    864             var cell = row.firstElementChild;
    865             cell = this._setNodeTag(
    866               cell,
    867               this._everyNode(cell.childNodes, this._isPhrasingContent)
    868                 ? "P"
    869                 : "DIV",
    870             );
    871             table.parentNode.replaceChild(cell, table);
    872           }
    873         }
    874       },
    875     );
    876   },
    877 
    878   /**
    879    * Initialize a node with the readability object. Also checks the
    880    * className/id for special names to add to its score.
    881    *
    882    * @param Element
    883    * @return void
    884    **/
    885   _initializeNode: function (node) {
    886     node.readability = { contentScore: 0 };
    887 
    888     switch (node.tagName) {
    889       case "DIV":
    890         node.readability.contentScore += 5;
    891         break;
    892 
    893       case "PRE":
    894       case "TD":
    895       case "BLOCKQUOTE":
    896         node.readability.contentScore += 3;
    897         break;
    898 
    899       case "ADDRESS":
    900       case "OL":
    901       case "UL":
    902       case "DL":
    903       case "DD":
    904       case "DT":
    905       case "LI":
    906       case "FORM":
    907         node.readability.contentScore -= 3;
    908         break;
    909 
    910       case "H1":
    911       case "H2":
    912       case "H3":
    913       case "H4":
    914       case "H5":
    915       case "H6":
    916       case "TH":
    917         node.readability.contentScore -= 5;
    918         break;
    919     }
    920 
    921     node.readability.contentScore += this._getClassWeight(node);
    922   },
    923 
    924   _removeAndGetNext: function (node) {
    925     var nextNode = this._getNextNode(node, true);
    926     node.parentNode.removeChild(node);
    927     return nextNode;
    928   },
    929 
    930   /**
    931    * Traverse the DOM from node to node, starting at the node passed in.
    932    * Pass true for the second parameter to indicate this node itself
    933    * (and its kids) are going away, and we want the next node over.
    934    *
    935    * Calling this in a loop will traverse the DOM depth-first.
    936    */
    937   _getNextNode: function (node, ignoreSelfAndKids) {
    938     // First check for kids if those aren't being ignored
    939     if (!ignoreSelfAndKids && node.firstElementChild) {
    940       return node.firstElementChild;
    941     }
    942     // Then for siblings...
    943     if (node.nextElementSibling) {
    944       return node.nextElementSibling;
    945     }
    946     // And finally, move up the parent chain *and* find a sibling
    947     // (because this is depth-first traversal, we will have already
    948     // seen the parent nodes themselves).
    949     do {
    950       node = node.parentNode;
    951     } while (node && !node.nextElementSibling);
    952     return node && node.nextElementSibling;
    953   },
    954 
    955   // compares second text to first one
    956   // 1 = same text, 0 = completely different text
    957   // works the way that it splits both texts into words and then finds words that are unique in second text
    958   // the result is given by the lower length of unique parts
    959   _textSimilarity: function (textA, textB) {
    960     var tokensA = textA
    961       .toLowerCase()
    962       .split(this.REGEXPS.tokenize)
    963       .filter(Boolean);
    964     var tokensB = textB
    965       .toLowerCase()
    966       .split(this.REGEXPS.tokenize)
    967       .filter(Boolean);
    968     if (!tokensA.length || !tokensB.length) {
    969       return 0;
    970     }
    971     var uniqTokensB = tokensB.filter((token) => !tokensA.includes(token));
    972     var distanceB = uniqTokensB.join(" ").length / tokensB.join(" ").length;
    973     return 1 - distanceB;
    974   },
    975 
    976   _checkByline: function (node, matchString) {
    977     if (this._articleByline) {
    978       return false;
    979     }
    980 
    981     if (node.getAttribute !== undefined) {
    982       var rel = node.getAttribute("rel");
    983       var itemprop = node.getAttribute("itemprop");
    984     }
    985 
    986     if (
    987       (rel === "author" ||
    988         (itemprop && itemprop.indexOf("author") !== -1) ||
    989         this.REGEXPS.byline.test(matchString)) &&
    990       this._isValidByline(node.textContent)
    991     ) {
    992       this._articleByline = node.textContent.trim();
    993       return true;
    994     }
    995 
    996     return false;
    997   },
    998 
    999   _getNodeAncestors: function (node, maxDepth) {
   1000     maxDepth = maxDepth || 0;
   1001     var i = 0,
   1002       ancestors = [];
   1003     while (node.parentNode) {
   1004       ancestors.push(node.parentNode);
   1005       if (maxDepth && ++i === maxDepth) break;
   1006       node = node.parentNode;
   1007     }
   1008     return ancestors;
   1009   },
   1010 
   /***
    * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
    *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
    *
    * The whole algorithm runs inside a retry loop. When an attempt yields less text than
    * `this._charThreshold`, the page markup is restored from a snapshot, one flag is removed
    * (FLAG_STRIP_UNLIKELYS, then FLAG_WEIGHT_CLASSES, then FLAG_CLEAN_CONDITIONALLY) and the
    * algorithm runs again. Once no flag is left, the attempt with the most text is returned.
    *
    * Side effects visible in this method: mutates the document, may set `this._articleLang`
    * and `this._articleDir`, and appends to `this._attempts`.
    *
    * @param page a document to run upon. Needs to be a full document, complete with body.
    *             A falsy value falls back to `this._doc.body`.
    * @return Element the article container, or null when there is no body or no attempt
    *                 produced any text.
    **/
   _grabArticle: function (page) {
     this.log("**** grabArticle ****");
     var doc = this._doc;
     // NOTE(review): this is a strict comparison against null, so calling the
     // method with no argument (page === undefined) also yields isPaging === true.
     var isPaging = page !== null;
     page = page ? page : this._doc.body;

     // We can't grab an article if we don't have a page!
     if (!page) {
       this.log("No body found in document. Abort.");
       return null;
     }

     // Snapshot of the untouched markup; restored before every retry because
     // each attempt below mutates the DOM destructively.
     var pageCacheHtml = page.innerHTML;

     // One iteration == one extraction attempt with the currently active flags.
     while (true) {
       this.log("Starting grabArticle loop");
       var stripUnlikelyCandidates = this._flagIsActive(
         this.FLAG_STRIP_UNLIKELYS,
       );

       // First, node prepping. Trash nodes that look cruddy (like ones with the
       // class name "comment", etc), and turn divs into P tags where they have been
       // used inappropriately (as in, where they contain no other block level elements.)
       var elementsToScore = [];
       var node = this._doc.documentElement;

       // Only the first header that duplicates the title is removed per attempt.
       let shouldRemoveTitleHeader = true;

       // Depth-first walk over every element. Branches that remove the current
       // node advance via _removeAndGetNext and `continue`; everything else
       // falls through to _getNextNode at the bottom of the loop.
       while (node) {
         if (node.tagName === "HTML") {
           this._articleLang = node.getAttribute("lang");
         }

         // className and id joined, used for all name-based heuristics below.
         var matchString = node.className + " " + node.id;

         if (!this._isProbablyVisible(node)) {
           this.log("Removing hidden node - " + matchString);
           node = this._removeAndGetNext(node);
           continue;
         }

         // Check to see if this node is a byline, and remove it if it is.
         if (this._checkByline(node, matchString)) {
           node = this._removeAndGetNext(node);
           continue;
         }

         if (shouldRemoveTitleHeader && this._headerDuplicatesTitle(node)) {
           this.log(
             "Removing header: ",
             node.textContent.trim(),
             this._articleTitle.trim(),
           );
           shouldRemoveTitleHeader = false;
           node = this._removeAndGetNext(node);
           continue;
         }

         // Remove unlikely candidates
         if (stripUnlikelyCandidates) {
           // Name-based removal. Nodes inside a table or code element, the
           // BODY itself and anchors are exempt.
           if (
             this.REGEXPS.unlikelyCandidates.test(matchString) &&
             !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
             !this._hasAncestorTag(node, "table") &&
             !this._hasAncestorTag(node, "code") &&
             node.tagName !== "BODY" &&
             node.tagName !== "A"
           ) {
             this.log("Removing unlikely candidate - " + matchString);
             node = this._removeAndGetNext(node);
             continue;
           }

           // Role-based removal: the node's role attribute is listed in UNLIKELY_ROLES.
           if (this.UNLIKELY_ROLES.includes(node.getAttribute("role"))) {
             this.log(
               "Removing content with role " +
                 node.getAttribute("role") +
                 " - " +
                 matchString,
             );
             node = this._removeAndGetNext(node);
             continue;
           }
         }

         // Remove DIV, SECTION, HEADER and H1-H6 nodes without any content (e.g. text, image, video, or iframe).
         if (
           (node.tagName === "DIV" ||
             node.tagName === "SECTION" ||
             node.tagName === "HEADER" ||
             node.tagName === "H1" ||
             node.tagName === "H2" ||
             node.tagName === "H3" ||
             node.tagName === "H4" ||
             node.tagName === "H5" ||
             node.tagName === "H6") &&
           this._isElementWithoutContent(node)
         ) {
           node = this._removeAndGetNext(node);
           continue;
         }

         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
           elementsToScore.push(node);
         }

         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
           // Put phrasing content into paragraphs.
           // `p` is the paragraph currently being filled; null means the next
           // non-whitespace phrasing child opens a new one.
           var p = null;
           var childNode = node.firstChild;
           while (childNode) {
             // Captured up front because childNode may be moved into `p` below.
             var nextSibling = childNode.nextSibling;
             if (this._isPhrasingContent(childNode)) {
               if (p !== null) {
                 p.appendChild(childNode);
               } else if (!this._isWhitespace(childNode)) {
                 p = doc.createElement("p");
                 node.replaceChild(p, childNode);
                 p.appendChild(childNode);
               }
             } else if (p !== null) {
               // A non-phrasing child closes the open paragraph; trim its
               // trailing whitespace nodes first.
               while (p.lastChild && this._isWhitespace(p.lastChild)) {
                 p.removeChild(p.lastChild);
               }
               p = null;
             }
             childNode = nextSibling;
           }

           // Sites like http://mobile.slate.com encloses each paragraph with a DIV
           // element. DIVs with only a P element inside and no text content can be
           // safely converted into plain P elements to avoid confusing the scoring
           // algorithm with DIVs with are, in practice, paragraphs.
           if (
             this._hasSingleTagInsideElement(node, "P") &&
             this._getLinkDensity(node) < 0.25
           ) {
             var newNode = node.children[0];
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
             elementsToScore.push(node);
           } else if (!this._hasChildBlockElement(node)) {
             node = this._setNodeTag(node, "P");
             elementsToScore.push(node);
           }
         }
         node = this._getNextNode(node);
       }

       /**
        * Loop through all paragraphs, and assign a score to them based on how content-y they look.
        * Then add their score to their parent node.
        *
        * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
        **/
       var candidates = [];
       this._forEachNode(elementsToScore, function (elementToScore) {
         // Skip elements that are detached or whose parent is not an element.
         if (
           !elementToScore.parentNode ||
           typeof elementToScore.parentNode.tagName === "undefined"
         )
           return;

         // If this paragraph is less than 25 characters, don't even count it.
         var innerText = this._getInnerText(elementToScore);
         if (innerText.length < 25) return;

         // Exclude nodes with no ancestor. At most 5 levels of ancestors are scored.
         var ancestors = this._getNodeAncestors(elementToScore, 5);
         if (ancestors.length === 0) return;

         var contentScore = 0;

         // Add a point for the paragraph itself as a base.
         contentScore += 1;

         // Add points for any commas within this paragraph.
         // NOTE(review): split(",").length is the number of commas plus one,
         // so a paragraph without commas still gains one point here.
         contentScore += innerText.split(",").length;

         // For every 100 characters in this paragraph, add another point. Up to 3 points.
         contentScore += Math.min(Math.floor(innerText.length / 100), 3);

         // Initialize and score ancestors.
         this._forEachNode(ancestors, function (ancestor, level) {
           // Skip non-element ancestors and elements whose own parent is not an element.
           if (
             !ancestor.tagName ||
             !ancestor.parentNode ||
             typeof ancestor.parentNode.tagName === "undefined"
           )
             return;

           // First time this ancestor is seen: give it a base score and
           // register it as a candidate.
           if (typeof ancestor.readability === "undefined") {
             this._initializeNode(ancestor);
             candidates.push(ancestor);
           }

           // Node score divider:
           // - parent:             1 (no division)
           // - grandparent:        2
           // - great grandparent+: ancestor level * 3
           // NOTE(review): `var` in the first branch is hoisted to the callback
           // scope, which is why the other branches can assign to it.
           if (level === 0) var scoreDivider = 1;
           else if (level === 1) scoreDivider = 2;
           else scoreDivider = level * 3;
           ancestor.readability.contentScore += contentScore / scoreDivider;
         });
       });

       // After we've calculated scores, loop through all of the possible
       // candidate nodes we found and find the one with the highest score.
       // topCandidates is kept sorted by descending score and capped at
       // this._nbTopCandidates entries.
       var topCandidates = [];
       for (var c = 0, cl = candidates.length; c < cl; c += 1) {
         var candidate = candidates[c];

         // Scale the final candidates score based on link density. Good content
         // should have a relatively small link density (5% or less) and be mostly
         // unaffected by this operation.
         var candidateScore =
           candidate.readability.contentScore *
           (1 - this._getLinkDensity(candidate));
         candidate.readability.contentScore = candidateScore;

         this.log("Candidate:", candidate, "with score " + candidateScore);

         // Insert at the first free slot or in front of the first entry with a lower score.
         for (var t = 0; t < this._nbTopCandidates; t++) {
           var aTopCandidate = topCandidates[t];

           if (
             !aTopCandidate ||
             candidateScore > aTopCandidate.readability.contentScore
           ) {
             topCandidates.splice(t, 0, candidate);
             if (topCandidates.length > this._nbTopCandidates)
               topCandidates.pop();
             break;
           }
         }
       }

       var topCandidate = topCandidates[0] || null;
       var neededToCreateTopCandidate = false;
       var parentOfTopCandidate;

       // If we still have no top candidate, just use the body as a last resort.
       // We also have to copy the body node so it is something we can modify.
       if (topCandidate === null || topCandidate.tagName === "BODY") {
         // Move all of the page's children into topCandidate
         topCandidate = doc.createElement("DIV");
         neededToCreateTopCandidate = true;
         // Move everything (not just elements, also text nodes etc.) into the container
         // so we even include text directly in the body:
         while (page.firstChild) {
           this.log("Moving child out:", page.firstChild);
           topCandidate.appendChild(page.firstChild);
         }

         page.appendChild(topCandidate);

         this._initializeNode(topCandidate);
       } else if (topCandidate) {
         // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
         // and whose scores are quite closed with current `topCandidate` node.
         // Each entry is the full ancestor list of one runner-up whose score is
         // at least 75% of the top candidate's.
         var alternativeCandidateAncestors = [];
         for (var i = 1; i < topCandidates.length; i++) {
           if (
             topCandidates[i].readability.contentScore /
               topCandidate.readability.contentScore >=
             0.75
           ) {
             alternativeCandidateAncestors.push(
               this._getNodeAncestors(topCandidates[i]),
             );
           }
         }
         var MINIMUM_TOPCANDIDATES = 3;
         if (alternativeCandidateAncestors.length >= MINIMUM_TOPCANDIDATES) {
           // Walk up from the top candidate until an ancestor is shared by at
           // least MINIMUM_TOPCANDIDATES of the runner-up ancestor lists.
           parentOfTopCandidate = topCandidate.parentNode;
           while (parentOfTopCandidate.tagName !== "BODY") {
             var listsContainingThisAncestor = 0;
             for (
               var ancestorIndex = 0;
               ancestorIndex < alternativeCandidateAncestors.length &&
               listsContainingThisAncestor < MINIMUM_TOPCANDIDATES;
               ancestorIndex++
             ) {
               listsContainingThisAncestor += Number(
                 alternativeCandidateAncestors[ancestorIndex].includes(
                   parentOfTopCandidate,
                 ),
               );
             }
             if (listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES) {
               topCandidate = parentOfTopCandidate;
               break;
             }
             parentOfTopCandidate = parentOfTopCandidate.parentNode;
           }
         }
         // The promoted ancestor may never have been scored.
         if (!topCandidate.readability) {
           this._initializeNode(topCandidate);
         }

         // Because of our bonus system, parents of candidates might have scores
         // themselves. They get half of the node. There won't be nodes with higher
         // scores than our topCandidate, but if we see the score going *up* in the first
         // few steps up the tree, that's a decent sign that there might be more content
         // lurking in other places that we want to unify in. The sibling stuff
         // below does some of that - but only if we've looked high enough up the DOM
         // tree.
         parentOfTopCandidate = topCandidate.parentNode;
         var lastScore = topCandidate.readability.contentScore;
         // The scores shouldn't get too low.
         var scoreThreshold = lastScore / 3;
         while (parentOfTopCandidate.tagName !== "BODY") {
           // Unscored ancestors are skipped without affecting lastScore.
           if (!parentOfTopCandidate.readability) {
             parentOfTopCandidate = parentOfTopCandidate.parentNode;
             continue;
           }
           var parentScore = parentOfTopCandidate.readability.contentScore;
           if (parentScore < scoreThreshold) break;
           if (parentScore > lastScore) {
             // Alright! We found a better parent to use.
             topCandidate = parentOfTopCandidate;
             break;
           }
           lastScore = parentOfTopCandidate.readability.contentScore;
           parentOfTopCandidate = parentOfTopCandidate.parentNode;
         }

         // If the top candidate is the only child, use parent instead. This will help sibling
         // joining logic when adjacent content is actually located in parent's sibling node.
         parentOfTopCandidate = topCandidate.parentNode;
         while (
           parentOfTopCandidate.tagName != "BODY" &&
           parentOfTopCandidate.children.length == 1
         ) {
           topCandidate = parentOfTopCandidate;
           parentOfTopCandidate = topCandidate.parentNode;
         }
         if (!topCandidate.readability) {
           this._initializeNode(topCandidate);
         }
       }

       // Now that we have the top candidate, look through its siblings for content
       // that might also be related. Things like preambles, content split by ads
       // that we removed, etc.
       var articleContent = doc.createElement("DIV");
       if (isPaging) articleContent.id = "readability-content";

       // A sibling needs at least 20% of the top candidate's score, and never less than 10.
       var siblingScoreThreshold = Math.max(
         10,
         topCandidate.readability.contentScore * 0.2,
       );
       // Keep potential top candidate's parent node to try to get text direction of it later.
       parentOfTopCandidate = topCandidate.parentNode;
       var siblings = parentOfTopCandidate.children;

       for (var s = 0, sl = siblings.length; s < sl; s++) {
         var sibling = siblings[s];
         var append = false;

         this.log(
           "Looking at sibling node:",
           sibling,
           sibling.readability
             ? "with score " + sibling.readability.contentScore
             : "",
         );
         this.log(
           "Sibling has score",
           sibling.readability ? sibling.readability.contentScore : "Unknown",
         );

         if (sibling === topCandidate) {
           append = true;
         } else {
           var contentBonus = 0;

           // Give a bonus if sibling nodes and top candidates have the example same classname
           if (
             sibling.className === topCandidate.className &&
             topCandidate.className !== ""
           )
             contentBonus += topCandidate.readability.contentScore * 0.2;

           if (
             sibling.readability &&
             sibling.readability.contentScore + contentBonus >=
               siblingScoreThreshold
           ) {
             append = true;
           } else if (sibling.nodeName === "P") {
             // Paragraphs that miss the score threshold are still kept when
             // they are long with few links, or short, link-free and contain
             // a period followed by a space or the end of the text.
             var linkDensity = this._getLinkDensity(sibling);
             var nodeContent = this._getInnerText(sibling);
             var nodeLength = nodeContent.length;

             if (nodeLength > 80 && linkDensity < 0.25) {
               append = true;
             } else if (
               nodeLength < 80 &&
               nodeLength > 0 &&
               linkDensity === 0 &&
               nodeContent.search(/\.( |$)/) !== -1
             ) {
               append = true;
             }
           }
         }

         if (append) {
           this.log("Appending node:", sibling);

           if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
             // We have a node that isn't a common block level element, like a form or td tag.
             // Turn it into a div so it doesn't get filtered out later by accident.
             this.log("Altering sibling:", sibling, "to div.");

             sibling = this._setNodeTag(sibling, "DIV");
           }

           articleContent.appendChild(sibling);
           // Fetch children again to make it compatible
           // with DOM parsers without live collection support.
           siblings = parentOfTopCandidate.children;
           // siblings is a reference to the children array, and
           // sibling is removed from the array when we call appendChild().
           // As a result, we must revisit this index since the nodes
           // have been shifted.
           s -= 1;
           sl -= 1;
         }
       }

       if (this._debug)
         this.log("Article content pre-prep: " + articleContent.innerHTML);
       // So we have all of the content that we need. Now we clean it up for presentation.
       this._prepArticle(articleContent);
       if (this._debug)
         this.log("Article content post-prep: " + articleContent.innerHTML);

       if (neededToCreateTopCandidate) {
         // We already created a fake div thing, and there wouldn't have been any siblings left
         // for the previous loop, so there's no point trying to create a new div, and then
         // move all the children over. Just assign IDs and class names here. No need to append
         // because that already happened anyway.
         topCandidate.id = "readability-page-1";
         topCandidate.className = "page";
       } else {
         // Wrap everything collected so far in a single page container.
         var div = doc.createElement("DIV");
         div.id = "readability-page-1";
         div.className = "page";
         while (articleContent.firstChild) {
           div.appendChild(articleContent.firstChild);
         }
         articleContent.appendChild(div);
       }

       if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);

       var parseSuccessful = true;

       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
       var textLength = this._getInnerText(articleContent, true).length;
       if (textLength < this._charThreshold) {
         parseSuccessful = false;
         // Undo this attempt's DOM mutations before a possible retry.
         page.innerHTML = pageCacheHtml;

         // Remove exactly one flag per failed attempt, in a fixed order, and
         // record the attempt so the best one can be picked at the end.
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
           this._attempts.push({
             articleContent: articleContent,
             textLength: textLength,
           });
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
           this._attempts.push({
             articleContent: articleContent,
             textLength: textLength,
           });
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
           this._attempts.push({
             articleContent: articleContent,
             textLength: textLength,
           });
         } else {
           this._attempts.push({
             articleContent: articleContent,
             textLength: textLength,
           });
           // No luck after removing flags, just return the longest text we found during the different loops
           this._attempts.sort(function (a, b) {
             return b.textLength - a.textLength;
           });

           // But first check if we actually have something
           if (!this._attempts[0].textLength) {
             return null;
           }

           articleContent = this._attempts[0].articleContent;
           parseSuccessful = true;
         }
       }

       if (parseSuccessful) {
         // Find out text direction from ancestors of final top candidate.
         // The first node in this list with a non-empty `dir` attribute wins.
         // NOTE(review): parentOfTopCandidate and topCandidate come from the
         // current (last) attempt, which is not necessarily the attempt whose
         // articleContent was selected above.
         var ancestors = [parentOfTopCandidate, topCandidate].concat(
           this._getNodeAncestors(parentOfTopCandidate),
         );
         this._someNode(ancestors, function (ancestor) {
           if (!ancestor.tagName) return false;
           var articleDir = ancestor.getAttribute("dir");
           if (articleDir) {
             this._articleDir = articleDir;
             return true;
           }
           return false;
         });
         return articleContent;
       }
     }
   },
   1547 
   1548   /**
   1549    * Check whether the input string could be a byline.
   1550    * This verifies that the input is a string, and that the length
   1551    * is less than 100 chars.
   1552    *
   1553    * @param byline {string} - a string to check whether it's a byline.
   1554    * @return Boolean - whether the input string is a byline.
   1555    */
   1556   _isValidByline: function (byline) {
   1557     if (typeof byline == "string" || byline instanceof String) {
   1558       byline = byline.trim();
   1559       return byline.length > 0 && byline.length < 100;
   1560     }
   1561     return false;
   1562   },
   1563 
   1564   /**
   1565    * Converts some of the common HTML entities in string to their corresponding characters.
   1566    *
   1567    * @param str {string} - a string to unescape.
   1568    * @return string without HTML entity.
   1569    */
   1570   _unescapeHtmlEntities: function (str) {
   1571     if (!str) {
   1572       return str;
   1573     }
   1574 
   1575     var htmlEscapeMap = this.HTML_ESCAPE_MAP;
   1576     return str
   1577       .replace(/&(quot|amp|apos|lt|gt);/g, function (_, tag) {
   1578         return htmlEscapeMap[tag];
   1579       })
   1580       .replace(
   1581         /&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi,
   1582         function (_, hex, numStr) {
   1583           var num = parseInt(hex || numStr, hex ? 16 : 10);
   1584           return String.fromCharCode(num);
   1585         },
   1586       );
   1587   },
   1588 
   1589   /**
   1590    * Try to extract metadata from JSON-LD object.
   1591    * For now, only Schema.org objects of type Article or its subtypes are supported.
   1592    * @return Object with any metadata that could be extracted (possibly none)
   1593    */
   1594   _getJSONLD: function (doc) {
   1595     var scripts = this._getAllNodesWithTag(doc, ["script"]);
   1596 
   1597     var metadata;
   1598 
   1599     this._forEachNode(scripts, function (jsonLdElement) {
   1600       if (
   1601         !metadata &&
   1602         jsonLdElement.getAttribute("type") === "application/ld+json"
   1603       ) {
   1604         try {
   1605           // Strip CDATA markers if present
   1606           var content = jsonLdElement.textContent.replace(
   1607             /^\s*<!\[CDATA\[|\]\]>\s*$/g,
   1608             "",
   1609           );
   1610           var parsed = JSON.parse(content);
   1611           if (
   1612             !parsed["@context"] ||
   1613             !parsed["@context"].match(/^https?\:\/\/schema\.org$/)
   1614           ) {
   1615             return;
   1616           }
   1617 
   1618           if (!parsed["@type"] && Array.isArray(parsed["@graph"])) {
   1619             parsed = parsed["@graph"].find(function (it) {
   1620               return (it["@type"] || "").match(this.REGEXPS.jsonLdArticleTypes);
   1621             });
   1622           }
   1623 
   1624           if (
   1625             !parsed ||
   1626             !parsed["@type"] ||
   1627             !parsed["@type"].match(this.REGEXPS.jsonLdArticleTypes)
   1628           ) {
   1629             return;
   1630           }
   1631 
   1632           metadata = {};
   1633 
   1634           if (
   1635             typeof parsed.name === "string" &&
   1636             typeof parsed.headline === "string" &&
   1637             parsed.name !== parsed.headline
   1638           ) {
   1639             // we have both name and headline element in the JSON-LD. They should both be the same but some websites like aktualne.cz
   1640             // put their own name into "name" and the article title to "headline" which confuses Readability. So we try to check if either
   1641             // "name" or "headline" closely matches the html title, and if so, use that one. If not, then we use "name" by default.
   1642 
   1643             var title = this._getArticleTitle();
   1644             var nameMatches = this._textSimilarity(parsed.name, title) > 0.75;
   1645             var headlineMatches =
   1646               this._textSimilarity(parsed.headline, title) > 0.75;
   1647 
   1648             if (headlineMatches && !nameMatches) {
   1649               metadata.title = parsed.headline;
   1650             } else {
   1651               metadata.title = parsed.name;
   1652             }
   1653           } else if (typeof parsed.name === "string") {
   1654             metadata.title = parsed.name.trim();
   1655           } else if (typeof parsed.headline === "string") {
   1656             metadata.title = parsed.headline.trim();
   1657           }
   1658           if (parsed.author) {
   1659             if (typeof parsed.author.name === "string") {
   1660               metadata.byline = parsed.author.name.trim();
   1661             } else if (
   1662               Array.isArray(parsed.author) &&
   1663               parsed.author[0] &&
   1664               typeof parsed.author[0].name === "string"
   1665             ) {
   1666               metadata.byline = parsed.author
   1667                 .filter(function (author) {
   1668                   return author && typeof author.name === "string";
   1669                 })
   1670                 .map(function (author) {
   1671                   return author.name.trim();
   1672                 })
   1673                 .join(", ");
   1674             }
   1675           }
   1676           if (typeof parsed.description === "string") {
   1677             metadata.excerpt = parsed.description.trim();
   1678           }
   1679           if (parsed.publisher && typeof parsed.publisher.name === "string") {
   1680             metadata.siteName = parsed.publisher.name.trim();
   1681           }
   1682           return;
   1683         } catch (err) {
   1684           this.log(err.message);
   1685         }
   1686       }
   1687     });
   1688     return metadata ? metadata : {};
   1689   },
   1690 
   1691   /**
   1692    * Attempts to get excerpt and byline metadata for the article.
   1693    *
   1694    * @param {Object} jsonld — object containing any metadata that
   1695    * could be extracted from JSON-LD object.
   1696    *
   1697    * @return Object with optional "excerpt" and "byline" properties
   1698    */
   1699   _getArticleMetadata: function (jsonld) {
   1700     var metadata = {};
   1701     var values = {};
   1702     var metaElements = this._doc.getElementsByTagName("meta");
   1703 
   1704     // property is a space-separated list of values
   1705     var propertyPattern =
   1706       /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*/gi;
   1707 
   1708     // name is a single value
   1709     var namePattern =
   1710       /^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$/i;
   1711 
   1712     // Find description tags.
   1713     this._forEachNode(metaElements, function (element) {
   1714       var elementName = element.getAttribute("name");
   1715       var elementProperty = element.getAttribute("property");
   1716       var content = element.getAttribute("content");
   1717       if (!content) {
   1718         return;
   1719       }
   1720       var matches = null;
   1721       var name = null;
   1722 
   1723       if (elementProperty) {
   1724         matches = elementProperty.match(propertyPattern);
   1725         if (matches) {
   1726           // Convert to lowercase, and remove any whitespace
   1727           // so we can match below.
   1728           name = matches[0].toLowerCase().replace(/\s/g, "");
   1729           // multiple authors
   1730           values[name] = content.trim();
   1731         }
   1732       }
   1733       if (!matches && elementName && namePattern.test(elementName)) {
   1734         name = elementName;
   1735         if (content) {
   1736           // Convert to lowercase, remove any whitespace, and convert dots
   1737           // to colons so we can match below.
   1738           name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
   1739           values[name] = content.trim();
   1740         }
   1741       }
   1742     });
   1743 
   1744     // get title
   1745     metadata.title =
   1746       jsonld.title ||
   1747       values["dc:title"] ||
   1748       values["dcterm:title"] ||
   1749       values["og:title"] ||
   1750       values["weibo:article:title"] ||
   1751       values["weibo:webpage:title"] ||
   1752       values["title"] ||
   1753       values["twitter:title"];
   1754 
   1755     if (!metadata.title) {
   1756       metadata.title = this._getArticleTitle();
   1757     }
   1758 
   1759     // get author
   1760     metadata.byline =
   1761       jsonld.byline ||
   1762       values["dc:creator"] ||
   1763       values["dcterm:creator"] ||
   1764       values["author"];
   1765 
   1766     // get description
   1767     metadata.excerpt =
   1768       jsonld.excerpt ||
   1769       values["dc:description"] ||
   1770       values["dcterm:description"] ||
   1771       values["og:description"] ||
   1772       values["weibo:article:description"] ||
   1773       values["weibo:webpage:description"] ||
   1774       values["description"] ||
   1775       values["twitter:description"];
   1776 
   1777     // get site name
   1778     metadata.siteName = jsonld.siteName || values["og:site_name"];
   1779 
   1780     // in many sites the meta value is escaped with HTML entities,
   1781     // so here we need to unescape it
   1782     metadata.title = this._unescapeHtmlEntities(metadata.title);
   1783     metadata.byline = this._unescapeHtmlEntities(metadata.byline);
   1784     metadata.excerpt = this._unescapeHtmlEntities(metadata.excerpt);
   1785     metadata.siteName = this._unescapeHtmlEntities(metadata.siteName);
   1786 
   1787     return metadata;
   1788   },
   1789 
   1790   /**
   1791    * Check if node is an image, or if node contains exactly one image,
   1792    * whether as a direct child or as a descendant.
   1793    *
   1794    * @param Element
   1795    **/
   1796   _isSingleImage: function (node) {
   1797     if (node.tagName === "IMG") {
   1798       return true;
   1799     }
   1800 
   1801     if (node.children.length !== 1 || node.textContent.trim() !== "") {
   1802       return false;
   1803     }
   1804 
   1805     return this._isSingleImage(node.children[0]);
   1806   },
   1807 
   1808   /**
   1809    * Find all <noscript> that are located after <img> nodes, and which contain only one
   1810    * <img> element. Replace the first image with the image from inside the <noscript> tag,
   1811    * and remove the <noscript> tag. This improves the quality of the images we use on
   1812    * some sites (e.g. Medium).
   1813    *
   1814    * @param Element
   1815    **/
   1816   _unwrapNoscriptImages: function (doc) {
   1817     // Find img without source or attributes that might contains image, and remove it.
   1818     // This is done to prevent a placeholder img is replaced by img from noscript in next step.
   1819     var imgs = Array.from(doc.getElementsByTagName("img"));
   1820     this._forEachNode(imgs, function (img) {
   1821       for (var i = 0; i < img.attributes.length; i++) {
   1822         var attr = img.attributes[i];
   1823         switch (attr.name) {
   1824           case "src":
   1825           case "srcset":
   1826           case "data-src":
   1827           case "data-srcset":
   1828             return;
   1829         }
   1830 
   1831         if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
   1832           return;
   1833         }
   1834       }
   1835 
   1836       img.parentNode.removeChild(img);
   1837     });
   1838 
   1839     // Next find noscript and try to extract its image
   1840     var noscripts = Array.from(doc.getElementsByTagName("noscript"));
   1841     this._forEachNode(noscripts, function (noscript) {
   1842       // Parse content of noscript and make sure it only contains image
   1843       var tmp = doc.createElement("div");
   1844       tmp.innerHTML = noscript.innerHTML;
   1845       if (!this._isSingleImage(tmp)) {
   1846         return;
   1847       }
   1848 
   1849       // If noscript has previous sibling and it only contains image,
   1850       // replace it with noscript content. However we also keep old
   1851       // attributes that might contains image.
   1852       var prevElement = noscript.previousElementSibling;
   1853       if (prevElement && this._isSingleImage(prevElement)) {
   1854         var prevImg = prevElement;
   1855         if (prevImg.tagName !== "IMG") {
   1856           prevImg = prevElement.getElementsByTagName("img")[0];
   1857         }
   1858 
   1859         var newImg = tmp.getElementsByTagName("img")[0];
   1860         for (var i = 0; i < prevImg.attributes.length; i++) {
   1861           var attr = prevImg.attributes[i];
   1862           if (attr.value === "") {
   1863             continue;
   1864           }
   1865 
   1866           if (
   1867             attr.name === "src" ||
   1868             attr.name === "srcset" ||
   1869             /\.(jpg|jpeg|png|webp)/i.test(attr.value)
   1870           ) {
   1871             if (newImg.getAttribute(attr.name) === attr.value) {
   1872               continue;
   1873             }
   1874 
   1875             var attrName = attr.name;
   1876             if (newImg.hasAttribute(attrName)) {
   1877               attrName = "data-old-" + attrName;
   1878             }
   1879 
   1880             newImg.setAttribute(attrName, attr.value);
   1881           }
   1882         }
   1883 
   1884         noscript.parentNode.replaceChild(tmp.firstElementChild, prevElement);
   1885       }
   1886     });
   1887   },
   1888 
   1889   /**
   1890    * Removes script tags from the document.
   1891    *
   1892    * @param Element
   1893    **/
   1894   _removeScripts: function (doc) {
   1895     this._removeNodes(
   1896       this._getAllNodesWithTag(doc, ["script"]),
   1897       function (scriptNode) {
   1898         scriptNode.nodeValue = "";
   1899         scriptNode.removeAttribute("src");
   1900         return true;
   1901       },
   1902     );
   1903     this._removeNodes(this._getAllNodesWithTag(doc, ["noscript"]));
   1904   },
   1905 
   1906   /**
   1907    * Check if this node has only whitespace and a single element with given tag
   1908    * Returns false if the DIV node contains non-empty text nodes
   1909    * or if it contains no element with given tag or more than 1 element.
   1910    *
   1911    * @param Element
   1912    * @param string tag of child element
   1913    **/
   1914   _hasSingleTagInsideElement: function (element, tag) {
   1915     // There should be exactly 1 element child with given tag
   1916     if (element.children.length != 1 || element.children[0].tagName !== tag) {
   1917       return false;
   1918     }
   1919 
   1920     // And there should be no text nodes with real content
   1921     return !this._someNode(element.childNodes, function (node) {
   1922       return (
   1923         node.nodeType === this.TEXT_NODE &&
   1924         this.REGEXPS.hasContent.test(node.textContent)
   1925       );
   1926     });
   1927   },
   1928 
   1929   _isElementWithoutContent: function (node) {
   1930     return (
   1931       node.nodeType === this.ELEMENT_NODE &&
   1932       node.textContent.trim().length == 0 &&
   1933       (node.children.length == 0 ||
   1934         node.children.length ==
   1935           node.getElementsByTagName("br").length +
   1936             node.getElementsByTagName("hr").length)
   1937     );
   1938   },
   1939 
   1940   /**
   1941    * Determine whether element has any children block level elements.
   1942    *
   1943    * @param Element
   1944    */
   1945   _hasChildBlockElement: function (element) {
   1946     return this._someNode(element.childNodes, function (node) {
   1947       return (
   1948         this.DIV_TO_P_ELEMS.has(node.tagName) ||
   1949         this._hasChildBlockElement(node)
   1950       );
   1951     });
   1952   },
   1953 
   1954   /***
   1955    * Determine if a node qualifies as phrasing content.
   1956    * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
   1957    **/
   1958   _isPhrasingContent: function (node) {
   1959     return (
   1960       node.nodeType === this.TEXT_NODE ||
   1961       this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
   1962       ((node.tagName === "A" ||
   1963         node.tagName === "DEL" ||
   1964         node.tagName === "INS") &&
   1965         this._everyNode(node.childNodes, this._isPhrasingContent))
   1966     );
   1967   },
   1968 
   1969   _isWhitespace: function (node) {
   1970     return (
   1971       (node.nodeType === this.TEXT_NODE &&
   1972         node.textContent.trim().length === 0) ||
   1973       (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR")
   1974     );
   1975   },
   1976 
   1977   /**
   1978    * Get the inner text of a node - cross browser compatibly.
   1979    * This also strips out any excess whitespace to be found.
   1980    *
   1981    * @param Element
   1982    * @param Boolean normalizeSpaces (default: true)
   1983    * @return string
   1984    **/
   1985   _getInnerText: function (e, normalizeSpaces) {
   1986     normalizeSpaces =
   1987       typeof normalizeSpaces === "undefined" ? true : normalizeSpaces;
   1988     var textContent = e.textContent.trim();
   1989 
   1990     if (normalizeSpaces) {
   1991       return textContent.replace(this.REGEXPS.normalize, " ");
   1992     }
   1993     return textContent;
   1994   },
   1995 
   1996   /**
   1997    * Get the number of times a string s appears in the node e.
   1998    *
   1999    * @param Element
   2000    * @param string - what to split on. Default is ","
   2001    * @return number (integer)
   2002    **/
   2003   _getCharCount: function (e, s) {
   2004     s = s || ",";
   2005     return this._getInnerText(e).split(s).length - 1;
   2006   },
   2007 
   2008   /**
   2009    * Remove the style attribute on every e and under.
   2010    * TODO: Test if getElementsByTagName(*) is faster.
   2011    *
   2012    * @param Element
   2013    * @return void
   2014    **/
   2015   _cleanStyles: function (e) {
   2016     if (!e || e.tagName.toLowerCase() === "svg") return;
   2017 
   2018     // Remove `style` and deprecated presentational attributes
   2019     for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
   2020       e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
   2021     }
   2022 
   2023     if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
   2024       e.removeAttribute("width");
   2025       e.removeAttribute("height");
   2026     }
   2027 
   2028     var cur = e.firstElementChild;
   2029     while (cur !== null) {
   2030       this._cleanStyles(cur);
   2031       cur = cur.nextElementSibling;
   2032     }
   2033   },
   2034 
   2035   /**
   2036    * Get the density of links as a percentage of the content
   2037    * This is the amount of text that is inside a link divided by the total text in the node.
   2038    *
   2039    * @param Element
   2040    * @return number (float)
   2041    **/
   2042   _getLinkDensity: function (element) {
   2043     var textLength = this._getInnerText(element).length;
   2044     if (textLength === 0) return 0;
   2045 
   2046     var linkLength = 0;
   2047 
   2048     // XXX implement _reduceNodeList?
   2049     this._forEachNode(element.getElementsByTagName("a"), function (linkNode) {
   2050       var href = linkNode.getAttribute("href");
   2051       var coefficient = href && this.REGEXPS.hashUrl.test(href) ? 0.3 : 1;
   2052       linkLength += this._getInnerText(linkNode).length * coefficient;
   2053     });
   2054 
   2055     return linkLength / textLength;
   2056   },
   2057 
   2058   /**
   2059    * Get an elements class/id weight. Uses regular expressions to tell if this
   2060    * element looks good or bad.
   2061    *
   2062    * @param Element
   2063    * @return number (Integer)
   2064    **/
   2065   _getClassWeight: function (e) {
   2066     if (!this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) return 0;
   2067 
   2068     var weight = 0;
   2069 
   2070     // Look for a special classname
   2071     if (typeof e.className === "string" && e.className !== "") {
   2072       if (this.REGEXPS.negative.test(e.className)) weight -= 25;
   2073 
   2074       if (this.REGEXPS.positive.test(e.className)) weight += 25;
   2075     }
   2076 
   2077     // Look for a special ID
   2078     if (typeof e.id === "string" && e.id !== "") {
   2079       if (this.REGEXPS.negative.test(e.id)) weight -= 25;
   2080 
   2081       if (this.REGEXPS.positive.test(e.id)) weight += 25;
   2082     }
   2083 
   2084     return weight;
   2085   },
   2086 
   2087   /**
   2088    * Clean a node of all elements of type "tag".
   2089    * (Unless it's a youtube/vimeo video. People love movies.)
   2090    *
   2091    * @param Element
   2092    * @param string tag to clean
   2093    * @return void
   2094    **/
   2095   _clean: function (e, tag) {
   2096     var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
   2097 
   2098     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (element) {
   2099       // Allow youtube and vimeo videos through as people usually want to see those.
   2100       if (isEmbed) {
   2101         // First, check the elements attributes to see if any of them contain youtube or vimeo
   2102         for (var i = 0; i < element.attributes.length; i++) {
   2103           if (this.REGEXPS.videos.test(element.attributes[i].value)) {
   2104             return false;
   2105           }
   2106         }
   2107 
   2108         // For embed with <object> tag, check inner HTML as well.
   2109         if (
   2110           element.tagName === "object" &&
   2111           this.REGEXPS.videos.test(element.innerHTML)
   2112         ) {
   2113           return false;
   2114         }
   2115       }
   2116 
   2117       return true;
   2118     });
   2119   },
   2120 
   2121   /**
   2122    * Check if a given node has one of its ancestor tag name matching the
   2123    * provided one.
   2124    * @param  HTMLElement node
   2125    * @param  String      tagName
   2126    * @param  Number      maxDepth
   2127    * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
   2128    * @return Boolean
   2129    */
   2130   _hasAncestorTag: function (node, tagName, maxDepth, filterFn) {
   2131     maxDepth = maxDepth || 3;
   2132     tagName = tagName.toUpperCase();
   2133     var depth = 0;
   2134     while (node.parentNode) {
   2135       if (maxDepth > 0 && depth > maxDepth) return false;
   2136       if (
   2137         node.parentNode.tagName === tagName &&
   2138         (!filterFn || filterFn(node.parentNode))
   2139       )
   2140         return true;
   2141       node = node.parentNode;
   2142       depth++;
   2143     }
   2144     return false;
   2145   },
   2146 
   2147   /**
   2148    * Return an object indicating how many rows and columns this table has.
   2149    */
   2150   _getRowAndColumnCount: function (table) {
   2151     var rows = 0;
   2152     var columns = 0;
   2153     var trs = table.getElementsByTagName("tr");
   2154     for (var i = 0; i < trs.length; i++) {
   2155       var rowspan = trs[i].getAttribute("rowspan") || 0;
   2156       if (rowspan) {
   2157         rowspan = parseInt(rowspan, 10);
   2158       }
   2159       rows += rowspan || 1;
   2160 
   2161       // Now look for column-related info
   2162       var columnsInThisRow = 0;
   2163       var cells = trs[i].getElementsByTagName("td");
   2164       for (var j = 0; j < cells.length; j++) {
   2165         var colspan = cells[j].getAttribute("colspan") || 0;
   2166         if (colspan) {
   2167           colspan = parseInt(colspan, 10);
   2168         }
   2169         columnsInThisRow += colspan || 1;
   2170       }
   2171       columns = Math.max(columns, columnsInThisRow);
   2172     }
   2173     return { rows: rows, columns: columns };
   2174   },
   2175 
   2176   /**
   2177    * Look for 'data' (as opposed to 'layout') tables, for which we use
   2178    * similar checks as
   2179    * https://searchfox.org/mozilla-central/rev/f82d5c549f046cb64ce5602bfd894b7ae807c8f8/accessible/generic/TableAccessible.cpp#19
   2180    */
   2181   _markDataTables: function (root) {
   2182     var tables = root.getElementsByTagName("table");
   2183     for (var i = 0; i < tables.length; i++) {
   2184       var table = tables[i];
   2185       var role = table.getAttribute("role");
   2186       if (role == "presentation") {
   2187         table._readabilityDataTable = false;
   2188         continue;
   2189       }
   2190       var datatable = table.getAttribute("datatable");
   2191       if (datatable == "0") {
   2192         table._readabilityDataTable = false;
   2193         continue;
   2194       }
   2195       var summary = table.getAttribute("summary");
   2196       if (summary) {
   2197         table._readabilityDataTable = true;
   2198         continue;
   2199       }
   2200 
   2201       var caption = table.getElementsByTagName("caption")[0];
   2202       if (caption && caption.childNodes.length > 0) {
   2203         table._readabilityDataTable = true;
   2204         continue;
   2205       }
   2206 
   2207       // If the table has a descendant with any of these tags, consider a data table:
   2208       var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
   2209       var descendantExists = function (tag) {
   2210         return !!table.getElementsByTagName(tag)[0];
   2211       };
   2212       if (dataTableDescendants.some(descendantExists)) {
   2213         this.log("Data table because found data-y descendant");
   2214         table._readabilityDataTable = true;
   2215         continue;
   2216       }
   2217 
   2218       // Nested tables indicate a layout table:
   2219       if (table.getElementsByTagName("table")[0]) {
   2220         table._readabilityDataTable = false;
   2221         continue;
   2222       }
   2223 
   2224       var sizeInfo = this._getRowAndColumnCount(table);
   2225       if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
   2226         table._readabilityDataTable = true;
   2227         continue;
   2228       }
   2229       // Now just go by size entirely:
   2230       table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
   2231     }
   2232   },
   2233 
   2234   /* convert images and figures that have properties like data-src into images that can be loaded without JS */
   2235   _fixLazyImages: function (root) {
   2236     this._forEachNode(
   2237       this._getAllNodesWithTag(root, ["img", "picture", "figure"]),
   2238       function (elem) {
   2239         // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
   2240         // So, here we check if the data uri is too short, just might as well remove it.
   2241         if (elem.src && this.REGEXPS.b64DataUrl.test(elem.src)) {
   2242           // Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
   2243           var parts = this.REGEXPS.b64DataUrl.exec(elem.src);
   2244           if (parts[1] === "image/svg+xml") {
   2245             return;
   2246           }
   2247 
   2248           // Make sure this element has other attributes which contains image.
   2249           // If it doesn't, then this src is important and shouldn't be removed.
   2250           var srcCouldBeRemoved = false;
   2251           for (var i = 0; i < elem.attributes.length; i++) {
   2252             var attr = elem.attributes[i];
   2253             if (attr.name === "src") {
   2254               continue;
   2255             }
   2256 
   2257             if (/\.(jpg|jpeg|png|webp)/i.test(attr.value)) {
   2258               srcCouldBeRemoved = true;
   2259               break;
   2260             }
   2261           }
   2262 
   2263           // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
   2264           // it will be too small, therefore it might be placeholder image.
   2265           if (srcCouldBeRemoved) {
   2266             var b64starts = elem.src.search(/base64\s*/i) + 7;
   2267             var b64length = elem.src.length - b64starts;
   2268             if (b64length < 133) {
   2269               elem.removeAttribute("src");
   2270             }
   2271           }
   2272         }
   2273 
   2274         // also check for "null" to work around https://github.com/jsdom/jsdom/issues/2580
   2275         if (
   2276           (elem.src || (elem.srcset && elem.srcset != "null")) &&
   2277           elem.className.toLowerCase().indexOf("lazy") === -1
   2278         ) {
   2279           return;
   2280         }
   2281 
   2282         for (var j = 0; j < elem.attributes.length; j++) {
   2283           attr = elem.attributes[j];
   2284           if (
   2285             attr.name === "src" ||
   2286             attr.name === "srcset" ||
   2287             attr.name === "alt"
   2288           ) {
   2289             continue;
   2290           }
   2291           var copyTo = null;
   2292           if (/\.(jpg|jpeg|png|webp)\s+\d/.test(attr.value)) {
   2293             copyTo = "srcset";
   2294           } else if (/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/.test(attr.value)) {
   2295             copyTo = "src";
   2296           }
   2297           if (copyTo) {
   2298             //if this is an img or picture, set the attribute directly
   2299             if (elem.tagName === "IMG" || elem.tagName === "PICTURE") {
   2300               elem.setAttribute(copyTo, attr.value);
   2301             } else if (
   2302               elem.tagName === "FIGURE" &&
   2303               !this._getAllNodesWithTag(elem, ["img", "picture"]).length
   2304             ) {
   2305               //if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
   2306               //see the nytimes-3 testcase for an example
   2307               var img = this._doc.createElement("img");
   2308               img.setAttribute(copyTo, attr.value);
   2309               elem.appendChild(img);
   2310             }
   2311           }
   2312         }
   2313       },
   2314     );
   2315   },
   2316 
   2317   _getTextDensity: function (e, tags) {
   2318     var textLength = this._getInnerText(e, true).length;
   2319     if (textLength === 0) {
   2320       return 0;
   2321     }
   2322     var childrenLength = 0;
   2323     var children = this._getAllNodesWithTag(e, tags);
   2324     this._forEachNode(
   2325       children,
   2326       (child) => (childrenLength += this._getInnerText(child, true).length),
   2327     );
   2328     return childrenLength / textLength;
   2329   },
   2330 
   2331   /**
   2332    * Clean an element of all tags of type "tag" if they look fishy.
   2333    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   2334    *
   2335    * @return void
   2336    **/
   2337   _cleanConditionally: function (e, tag) {
   2338     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) return;
   2339 
   2340     // Gather counts for other typical elements embedded within.
   2341     // Traverse backwards so we can remove nodes at the same time
   2342     // without effecting the traversal.
   2343     //
   2344     // TODO: Consider taking into account original contentScore here.
   2345     this._removeNodes(this._getAllNodesWithTag(e, [tag]), function (node) {
   2346       // First check if this node IS data table, in which case don't remove it.
   2347       var isDataTable = function (t) {
   2348         return t._readabilityDataTable;
   2349       };
   2350 
   2351       var isList = tag === "ul" || tag === "ol";
   2352       if (!isList) {
   2353         var listLength = 0;
   2354         var listNodes = this._getAllNodesWithTag(node, ["ul", "ol"]);
   2355         this._forEachNode(
   2356           listNodes,
   2357           (list) => (listLength += this._getInnerText(list).length),
   2358         );
   2359         isList = listLength / this._getInnerText(node).length > 0.9;
   2360       }
   2361 
   2362       if (tag === "table" && isDataTable(node)) {
   2363         return false;
   2364       }
   2365 
   2366       // Next check if we're inside a data table, in which case don't remove it as well.
   2367       if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
   2368         return false;
   2369       }
   2370 
   2371       if (this._hasAncestorTag(node, "code")) {
   2372         return false;
   2373       }
   2374 
   2375       var weight = this._getClassWeight(node);
   2376 
   2377       this.log("Cleaning Conditionally", node);
   2378 
   2379       var contentScore = 0;
   2380 
   2381       if (weight + contentScore < 0) {
   2382         return true;
   2383       }
   2384 
   2385       if (this._getCharCount(node, ",") < 10) {
   2386         // If there are not very many commas, and the number of
   2387         // non-paragraph elements is more than paragraphs or other
   2388         // ominous signs, remove the element.
   2389         var p = node.getElementsByTagName("p").length;
   2390         var img = node.getElementsByTagName("img").length;
   2391         var li = node.getElementsByTagName("li").length - 100;
   2392         var input = node.getElementsByTagName("input").length;
   2393         var headingDensity = this._getTextDensity(node, [
   2394           "h1",
   2395           "h2",
   2396           "h3",
   2397           "h4",
   2398           "h5",
   2399           "h6",
   2400         ]);
   2401 
   2402         var embedCount = 0;
   2403         var embeds = this._getAllNodesWithTag(node, [
   2404           "object",
   2405           "embed",
   2406           "iframe",
   2407         ]);
   2408 
   2409         for (var i = 0; i < embeds.length; i++) {
   2410           // If this embed has attribute that matches video regex, don't delete it.
   2411           for (var j = 0; j < embeds[i].attributes.length; j++) {
   2412             if (this.REGEXPS.videos.test(embeds[i].attributes[j].value)) {
   2413               return false;
   2414             }
   2415           }
   2416 
   2417           // For embed with <object> tag, check inner HTML as well.
   2418           if (
   2419             embeds[i].tagName === "object" &&
   2420             this.REGEXPS.videos.test(embeds[i].innerHTML)
   2421           ) {
   2422             return false;
   2423           }
   2424 
   2425           embedCount++;
   2426         }
   2427 
   2428         var linkDensity = this._getLinkDensity(node);
   2429         var contentLength = this._getInnerText(node).length;
   2430 
   2431         var haveToRemove =
   2432           (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
   2433           (!isList && li > p) ||
   2434           input > Math.floor(p / 3) ||
   2435           (!isList &&
   2436             headingDensity < 0.9 &&
   2437             contentLength < 25 &&
   2438             (img === 0 || img > 2) &&
   2439             !this._hasAncestorTag(node, "figure")) ||
   2440           (!isList && weight < 25 && linkDensity > 0.2) ||
   2441           (weight >= 25 && linkDensity > 0.5) ||
   2442           (embedCount === 1 && contentLength < 75) ||
   2443           embedCount > 1;
   2444         // Allow simple lists of images to remain in pages
   2445         if (isList && haveToRemove) {
   2446           for (var x = 0; x < node.children.length; x++) {
   2447             let child = node.children[x];
   2448             // Don't filter in lists with li's that contain more than one child
   2449             if (child.children.length > 1) {
   2450               return haveToRemove;
   2451             }
   2452           }
   2453           li_count = node.getElementsByTagName("li").length;
   2454           // Only allow the list to remain if every li contains an image
   2455           if (img == li_count) {
   2456             return false;
   2457           }
   2458         }
   2459         return haveToRemove;
   2460       }
   2461       return false;
   2462     });
   2463   },
   2464 
   2465   /**
   2466    * Clean out elements that match the specified conditions
   2467    *
   2468    * @param Element
   2469    * @param Function determines whether a node should be removed
   2470    * @return void
   2471    **/
   2472   _cleanMatchedNodes: function (e, filter) {
   2473     var endOfSearchMarkerNode = this._getNextNode(e, true);
   2474     var next = this._getNextNode(e);
   2475     while (next && next != endOfSearchMarkerNode) {
   2476       if (filter.call(this, next, next.className + " " + next.id)) {
   2477         next = this._removeAndGetNext(next);
   2478       } else {
   2479         next = this._getNextNode(next);
   2480       }
   2481     }
   2482   },
   2483 
   2484   /**
   2485    * Clean out spurious headers from an Element.
   2486    *
   2487    * @param Element
   2488    * @return void
   2489    **/
   2490   _cleanHeaders: function (e) {
   2491     let headingNodes = this._getAllNodesWithTag(e, ["h1", "h2"]);
   2492     this._removeNodes(headingNodes, function (node) {
   2493       let shouldRemove = this._getClassWeight(node) < 0;
   2494       if (shouldRemove) {
   2495         this.log("Removing header with low class weight:", node);
   2496       }
   2497       return shouldRemove;
   2498     });
   2499   },
   2500 
   2501   /**
   2502    * Check if this node is an H1 or H2 element whose content is mostly
   2503    * the same as the article title.
   2504    *
   2505    * @param Element  the node to check.
   2506    * @return boolean indicating whether this is a title-like header.
   2507    */
   2508   _headerDuplicatesTitle: function (node) {
   2509     if (node.tagName != "H1" && node.tagName != "H2") {
   2510       return false;
   2511     }
   2512     var heading = this._getInnerText(node, false);
   2513     this.log("Evaluating similarity of header:", heading, this._articleTitle);
   2514     return this._textSimilarity(this._articleTitle, heading) > 0.75;
   2515   },
   2516 
   2517   _flagIsActive: function (flag) {
   2518     return (this._flags & flag) > 0;
   2519   },
   2520 
   2521   _removeFlag: function (flag) {
   2522     this._flags = this._flags & ~flag;
   2523   },
   2524 
   2525   _isProbablyVisible: function (node) {
   2526     // Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
   2527     return (
   2528       (!node.style || node.style.display != "none") &&
   2529       !node.hasAttribute("hidden") &&
   2530       //check for "fallback-image" so that wikimedia math images are displayed
   2531       (!node.hasAttribute("aria-hidden") ||
   2532         node.getAttribute("aria-hidden") != "true" ||
   2533         (node.className &&
   2534           node.className.indexOf &&
   2535           node.className.indexOf("fallback-image") !== -1))
   2536     );
   2537   },
   2538 
   2539   /**
   2540    * Runs readability.
   2541    *
   2542    * Workflow:
   2543    *  1. Prep the document by removing script tags, css, etc.
   2544    *  2. Build readability's DOM tree.
   2545    *  3. Grab the article content from the current dom tree.
   2546    *  4. Replace the current DOM tree with the new one.
   2547    *  5. Read peacefully.
   2548    *
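    * @return {Object|null} the extracted article (title, byline, dir, lang, content, textContent, length, excerpt, siteName), or null when no article content could be grabbed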
   2550    **/
   2551   parse: function () {
   2552     // Avoid parsing too large documents, as per configuration option
   2553     if (this._maxElemsToParse > 0) {
   2554       var numTags = this._doc.getElementsByTagName("*").length;
   2555       if (numTags > this._maxElemsToParse) {
   2556         throw new Error(
   2557           "Aborting parsing document; " + numTags + " elements found",
   2558         );
   2559       }
   2560     }
   2561 
   2562     // Unwrap image from noscript
   2563     this._unwrapNoscriptImages(this._doc);
   2564 
   2565     // Extract JSON-LD metadata before removing scripts
   2566     var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
   2567 
   2568     // Remove script tags from the document.
   2569     this._removeScripts(this._doc);
   2570 
   2571     this._prepDocument();
   2572 
   2573     var metadata = this._getArticleMetadata(jsonLd);
   2574     this._articleTitle = metadata.title;
   2575 
   2576     var articleContent = this._grabArticle();
   2577     if (!articleContent) return null;
   2578 
   2579     this.log("Grabbed: " + articleContent.innerHTML);
   2580 
   2581     this._postProcessContent(articleContent);
   2582 
   2583     // If we haven't found an excerpt in the article's metadata, use the article's
   2584     // first paragraph as the excerpt. This is used for displaying a preview of
   2585     // the article's content.
   2586     if (!metadata.excerpt) {
   2587       var paragraphs = articleContent.getElementsByTagName("p");
   2588       if (paragraphs.length > 0) {
   2589         metadata.excerpt = paragraphs[0].textContent.trim();
   2590       }
   2591     }
   2592 
   2593     var textContent = articleContent.textContent;
   2594     return {
   2595       title: this._articleTitle,
   2596       byline: metadata.byline || this._articleByline,
   2597       dir: this._articleDir,
   2598       lang: this._articleLang,
   2599       content: this._serializer(articleContent),
   2600       textContent: textContent,
   2601       length: textContent.length,
   2602       excerpt: metadata.excerpt,
   2603       siteName: metadata.siteName || this._articleSiteName,
   2604     };
   2605   },
   2606 };
   2607 
   // Expose the constructor through `module.exports` when a `module` object
   // exists; otherwise this is a no-op and Readability stays a plain global.
   if ("object" === typeof module) {
     module.exports = Readability;
   }
   2611 
   /*
    * Reader-mode stylesheet, kept as one string that already includes the
    * surrounding <style> tags; the script at the end of this file inserts it
    * into the page's <head> as markup.
    *
    * Note on escaping: inside this template literal "\\201C" yields the
    * characters \201C, i.e. the CSS escape for the left double quotation mark
    * used as the decorative blockquote glyph.
    */
   var style_sheet_simple = `
<style type="text/css">

body {
      padding: 40px 200px 40px 200px !important;
      font-size: 18px;
      font: 18px/1.5 Roboto;
      line-height: 1.6;
      background-color: #FEFEFE !important;
      color: #444 !important;
      max-width: 99% !important;
}

#readOverlay {
       display: block;
       position: absolute;
       background-color: white;
       top: 0;
       left: 0;
       width: 100%;
}

/* Block quotes */

blockquote{
  width:60%;
  margin: 5px auto;
  font-style:italic;
  color: #555555;
  padding: 1.2em 30px 1.2em 75px;
  border-left:8px solid #005386 ;
  line-height:1.3;
  position: relative;
  background: #F0F0F0;
}

blockquote::before{
  font-family: Arial;
  content: "\\201C";
  color: #005386;
  font-size: 6em;
  position: absolute;
  left: 10px;
  top:-10px;

}

a[href^="#footnote-"] {
    text-decoration: none;
}
a[href^="#footnote-"]::before {
    content:" [";
}
a[href^="#footnote-"]::after {
    content:"] ";
}

</style>`;
   2672 
   2673 /*
   2674 body {
   2675        max-width: 650px;
   2676        margin: 40px auto;
   2677        padding: 0 10px;
   2678        font: 18px/1.5 -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
   2679        color: #444
   2680 }
   2681 */
   2682 /* See also
   2683  * <https://gist.github.com/aanand/399131>
   2684  * and the one included in firefox @ <chrome://global/skin/aboutReader.css>
   2685  */
   // Reader-mode entry point. Readability is run on a clone because parse()
   // modifies the document it is given (it strips scripts, for example).
   var documentClone = document.cloneNode(true);
   var article = new Readability(documentClone).parse();

   // `var` is hoisted to script scope, so these names stay the same globals
   // as before even though they are only assigned when an article was found.
   var styles;
   var styleSheet;

   if (article) {
     // The stylesheet is trusted markup defined in this file. The title is
     // plain text taken from the page, so it is set through text APIs and
     // never interpolated into innerHTML, where a "<" or "&" in the title
     // would corrupt the markup.
     document.head.innerHTML = style_sheet_simple;
     document.title = article.title || "";

     // article.content is the HTML serialized by Readability.
     document.body.innerHTML = article.content;
     const heading = document.createElement("h1");
     heading.textContent = article.title || "";
     document.body.insertBefore(heading, document.body.firstChild);

     /* Hack for archive.is */
     styles = `
img {
  max-width: 80% !important;
  height: auto;
  display: block;
  margin-left: auto;
  margin-right: auto;
}
`;

     if (document.domain === "archive.is") {
       styles += `

  li > span {
    display: none !important;
  }

  /* Matt Levine's Money Stuff specific stuff: */

  p > span > em {
    display: none !important;
  }
  iframe {
    display: none;
  }
  div[id^='stickypbModal'] {
    display: none;
  }
  `;
     }

     styleSheet = document.createElement("style");
     // textContent stores the CSS verbatim; assigning innerText would turn
     // the newlines into <br> elements.
     styleSheet.textContent = styles;
     document.head.appendChild(styleSheet);
     console.log("Style changed");
   } else {
     // parse() returns null when no article content could be grabbed; leave
     // the page untouched instead of failing on `article.title`.
     console.log("Readability found no article content; page left unchanged");
   }