longnow-for-markdown

Take a markdown file and submit the links it contains to the Internet Archive
Log | Files | Refs | README

longnow.sh (5932B)


      # Filenames
      #
      # Every artefact name is derived from the markdown file passed as the first
      # argument: its name with a trailing ".md" removed is the common root.

      # Prints $1 with one trailing ".md" removed. The match is literal and only
      # applies to the end of the name, so "amd.notes.md" becomes "amd.notes" and a
      # name without the extension is printed unchanged.
      function stripMarkdownExtension(){
        printf '%s' "${1%.md}"
      }

      input="$1"
      root="$(stripMarkdownExtension "$input")"
      links="$root.links.txt"
      archivedLinks="$root.links.archived.txt"
      errors="$root.errors.txt"
      output="$root.longnow.md"

      ## Directories
      # initialDir is the directory the script was started from; workdir is the
      # per-document directory the remaining steps run in.
      initialDir="$(pwd)"
      workdir="longnow-$root"
     12 
     ## Move to work dir
     # Creates $workdir, copies $input into it and makes it the current directory.
     # Globals read: input, workdir.
     # Exits with status 1 (message on stderr) as soon as one step fails, so that
     # the later steps never run outside the work directory or without the input.
     function moveToWorkDir(){
       if ! mkdir -p "$workdir"; then
         echo "Could not create work directory $workdir" >&2
         exit 1
       fi
       if ! cp "$input" "$workdir/$input"; then
         echo "Could not copy $input to $workdir" >&2
         exit 1
       fi
       if ! cd "$workdir"; then
         echo "Could not enter work directory $workdir" >&2
         exit 1
       fi
     }
     19 
     ## Extract markdown links
     # Use: Takes the markdown file $input, extracts the http(s) targets of its
     # inline links and writes the unique ones, in order of first appearance, to
     # $links (one per line). $links is overwritten on every call.
     # Globals read: input, links.
     function extractMarkdownLinks(){
       echo ""
       echo "Extracting links..."

       # 1st grep: everything from "](" to the last ")" on a line.
       # 2nd grep: the URLs inside that span; a URL ends at ")" or at whitespace,
       #           so an optional link title ( [text](url "Title") ) is not
       #           swallowed into the URL.
       # awk:      drops repeated URLs while keeping the original order.
       grep -Eo '\]\((.*)\)' "$input" \
         | grep -Eo 'https?://[^)[:space:]]+' \
         | awk '!seen[$0]++' > "$links"

       echo "Done extracting links"
     }
     33 
     ## Push to Archive
     # Use: Takes the txt file $links (one link per line) and makes sure each link
     # has an archived copy recorded in $archivedLinks:
     #   1. a link already recorded in $archivedLinks is left alone;
     #   2. otherwise the Wayback availability API is asked for an existing
     #      snapshot, which is recorded if there is one;
     #   3. otherwise the link is submitted with archivenow, and either the
     #      resulting snapshot URL is recorded or the failure is logged to $errors.
     # Globals read: links, archivedLinks, errors.
     # $errors is recreated on every call; $archivedLinks is only appended to, so
     # earlier results are reused on later runs.
     # References:
     # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
     # https://github.com/oduwsdl/archivenow
     function pushToArchive(){
       local numLinesLinkFile totalTimeInMinutes counter line
       local localCopy onlineCopy archiveURL numSecondsSleep

       echo ""
       echo "Pushing to archive.org..."
       numLinesLinkFile=$(wc -l < "$links")
       # Truncated value of (n*7.5 + 60*n/15)/60, i.e. 23n/120, done in shell
       # arithmetic so that bc is not needed.
       totalTimeInMinutes=$(( ${numLinesLinkFile:-0} * 23 / 120 ))
       echo "Expected to take ~$totalTimeInMinutes mins."
       echo ""

       ## rm -f "$archivedLinks"
       rm -f "$errors"
       touch "$archivedLinks"
       touch "$errors"

       ## How to deal with errors that arise
       {
         echo "If this file contains errors, you can deal with them as follows:"
         echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again"
         echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md"
         echo ""
       } >> "$errors"

       ## Main body
       counter=1
       while IFS= read -r line || [ -n "$line" ]; do
         [ -n "$line" ] || continue
         echo "Url: $line"

         # Look for a recorded archive link that ends in the url, or in the url
         # plus "/" (preferred when both exist; the last such line wins). The
         # comparison is literal: the url is never interpreted as a regex.
         localCopy=$(needle="$line" awk '
           BEGIN { url = ENVIRON["needle"]; n = length(url) }
           {
             len = length($0)
             if (len >= n && substr($0, len - n + 1) == url) exact = $0
             if (len > n && substr($0, len - n) == (url "/")) slash = $0
           }
           END {
             if (slash != "") print slash
             else if (exact != "") print exact
           }
         ' "$archivedLinks")

         numSecondsSleep=0
         if [ -n "$localCopy" ]; then
           echo "Already in local archive: $localCopy"
         else
           # --get + --data-urlencode keeps "&", "?" and "#" inside the link from
           # being read as parameters of the API request itself.
           onlineCopy=$(curl --silent --get --data-urlencode "url=$line" \
             "https://archive.org/wayback/available" \
             | jq -r '.archived_snapshots.closest.url // empty')
           if [ -n "$onlineCopy" ]; then
             echo "Already in archive.org: $onlineCopy"
             echo "$onlineCopy" >> "$archivedLinks"
           else
             # Only submissions are rate limited, so the pause is decided here
             # and not on every loop iteration.
             if [ $((counter % 15)) -eq 0 ]; then
               printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
               sleep 60
             fi
             echo "Sending to archive..."
             archiveURL=$(archivenow --ia "$line")
             if [[ "$archiveURL" == "Error"* ]]; then
               {
                 echo "$line"
                 echo "$archiveURL"
                 echo ""
               } >> "$errors"
               echo "There was an error. See $errors for how to deal with it."
             else
               echo "$archiveURL" >> "$archivedLinks"
             fi
             counter=$((counter+1))
             numSecondsSleep=$((5 + (RANDOM % 15)))
           fi
         fi

         # Pause after a submission (5-19 s); nothing to wait for otherwise.
         if [ "$numSecondsSleep" -gt 0 ]; then
           echo "Sleeping for $numSecondsSleep seconds..."
           sleep "$numSecondsSleep"
         fi
         echo ""
       done < "$links"

       echo "Done pushing links to archive.org"
       echo ""
     }
    108 
    ## Add archive links to file
    # Writes $output: a copy of $input in which every occurrence of "url)" whose
    # url has a recorded archive link becomes "url) ([a](archiveUrl))".
    # Globals read: input, output, links, archivedLinks.
    # Urls and archive links are handled as literal text throughout, so characters
    # such as "&", ".", "*" or "[" in a url cannot change what gets replaced or
    # what gets inserted.
    function addArchiveLinksToFile(){
      local url archivedUrl tmpOutput

      echo "Creating longnow file at $output"

      rm -f "$output"
      cp "$input" "$output"

      while IFS= read -r url || [ -n "$url" ]; do
        [ -n "$url" ] || continue

        # Recorded archive link ending in the url, or in the url plus "/"
        # (preferred when both exist; the last such line wins).
        archivedUrl=$(needle="$url" awk '
          BEGIN { url = ENVIRON["needle"]; n = length(url) }
          {
            len = length($0)
            if (len >= n && substr($0, len - n + 1) == url) exact = $0
            if (len > n && substr($0, len - n) == (url "/")) slash = $0
          }
          END {
            if (slash != "") print slash
            else if (exact != "") print exact
          }
        ' "$archivedLinks")

        if [ -n "$archivedUrl" ]; then
          tmpOutput="$output.tmp"
          # Literal, left-to-right replacement of every "url)" on each line; text
          # that was just inserted is not scanned again within the same pass.
          if needle="$url)" replacement="$url) ([a]($archivedUrl))" awk '
            BEGIN {
              needle = ENVIRON["needle"]
              replacement = ENVIRON["replacement"]
              skip = length(needle)
            }
            {
              rest = $0
              head = ""
              while ((at = index(rest, needle)) > 0) {
                head = head substr(rest, 1, at - 1) replacement
                rest = substr(rest, at + skip)
              }
              print head rest
            }
          ' "$output" > "$tmpOutput"; then
            mv "$tmpOutput" "$output"
          else
            rm -f "$tmpOutput"
          fi
        fi
      done < "$links"

      echo "Done."
    }
    134 
    ## Explain installation
    # Prints, on stdout, how to obtain the archivenow utility.
    function explainArchiveNowInstallation(){
      printf '%s\n' \
        'Required archivenow utility not found in path.' \
        'Install with $ pip install archivenow' \
        '(resp. $ pip3 install archivenow)' \
        'Or follow instructions on https://github.com/oduwsdl/archivenow'
    }
    142 
    # Prints, on stdout, how to obtain the jq utility.
    function explainJqInstallation(){
      printf '%s\n' \
        'Required jq utility not found in path.' \
        'Install with your package manager, e.g., $ sudo apt install jq' \
        'Or follow instructions on https://stedolan.github.io/jq/download/'
    }
    ## Report errors
    # Points the user to $errors when it holds more than 4 lines, i.e. more than
    # the 4-line explanatory header pushToArchive writes at the top of that file.
    # Globals read: errors.
    # Stays silent when the file is missing or unreadable (nothing was recorded).
    function reportErrors(){
      local numLinesErrorFile

      [ -r "$errors" ] || return 0

      numLinesErrorFile=$(wc -l < "$errors")
      if (( numLinesErrorFile > 4 )); then
        echo "It seems that there are errors. To view and deal with them, see the $errors file"
      fi
    }
    155 
    ## Clean up
    # Copies $output from the current (work) directory into $initialDir and
    # changes back to $initialDir.
    # Globals read: output, initialDir.
    # The copy targets $initialDir explicitly rather than "..", so the result
    # lands next to the original document whatever the current directory is.
    function cleanup(){
      if ! cp "$output" "$initialDir/$output"; then
        echo "Could not copy $output to $initialDir" >&2
      fi
      cd "$initialDir" || exit 1
    }
    161 
    ## Main
    # Checks the preconditions and then runs the steps in order: move to the work
    # directory, extract the links, push them to the archive, write the annotated
    # file, report errors, clean up.
    # Globals read: input.
    # Returns 1 without doing any work when no readable input file was given or
    # when archivenow or jq cannot be found.
    function main(){
      if [ -z "$input" ] || [ ! -f "$input" ]; then
        echo "Usage: longnow yourfile.md" >&2
        return 1
      fi

      # command -v is a shell builtin, so the check does not depend on the
      # output format of an external lookup tool.
      if ! command -v archivenow > /dev/null 2>&1; then
        explainArchiveNowInstallation
        return 1
      fi
      if ! command -v jq > /dev/null 2>&1; then
        explainJqInstallation
        return 1
      fi

      moveToWorkDir
      extractMarkdownLinks
      pushToArchive
      addArchiveLinksToFile
      reportErrors
      cleanup
    }
    main
    180