longnow-for-markdown

Take a markdown file and feed its links to the Internet Archive

commit d8515b22df3eab2d9c0940a21b85c2064a2756d7
parent 26b83deb8e6d80f7dc92b8e922c02371fde42153
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sat,  5 Mar 2022 10:50:30 -0500

feat: Use archive.org snapshot if it already exists.
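
In brief: before pushing a link to archive.org, the script now first asks the Wayback Machine availability API whether a snapshot already exists, and reuses it if so. A minimal sketch of the new check, mirroring the relevant lines of longnow.sh in the diff below (`$line` holds the URL being processed):

```
# Query archive.org's availability API; it returns JSON whose
# .archived_snapshots.closest.url is the nearest snapshot, or null if none.
urlAlreadyInArchiveOnline="$(curl --silent "http://archive.org/wayback/available?url=$line" \
    | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g')"
if [ "$urlAlreadyInArchiveOnline" == "" ]; then
    # No snapshot yet: push the link to the Internet Archive as before.
    archiveURL=$(archivenow --ia "$line")
else
    # A snapshot exists: reuse it instead of archiving again.
    echo "Already in archive.org: $urlAlreadyInArchiveOnline"
fi
```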

Diffstat:
M README.md  |  11 ++++++++++-
D longnow    | 161 -------------------------------------------------------------------------------
A longnow.sh | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 190 insertions(+), 162 deletions(-)

diff --git a/README.md b/README.md
@@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s
 > Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
-
 ## How to install
 
 Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`)
@@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi
 pip install archivenow ## respectively, pip3
 ```
+It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as:
+
+```
+sudo apt install jq
+```
+
+if on Debian, or using your distribution's package manager otherwise.
+
+As of the newest iteration of this program, if archive.org already has a snapshot of the page, that snapshot is taken instead. This results in massive time savings, but could imply that a less up-to-date copy is used. If this behavior is not desired, it can easily be excised manually by removing the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`.
+
 ## How to use
 
 ```
diff --git a/longnow b/longnow
@@ -1,161 +0,0 @@
-# Filenames
-input="$1"
-root="$(echo "$input" | sed 's/.md//g' )"
-links="$root.links.txt"
-archivedLinks="$root.links.archived.txt"
-errors="$root.errors.txt"
-output="$root.longnow.md"
-
-## Directories
-initialDir="$(pwd)"
-workdir="longnow-$root"
-
-## Move to work dir
-function moveToWorkDir(){
-    mkdir -p "$workdir"
-    cp "$input" "$workdir/$input"
-    cd "$workdir"
-}
-
-## Extract markdown links
-function extractMarkdownLinks(){ # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links
-    links2="$root.links2.txt"
-    echo ""
-    echo "Extracting links..."
-
-    rm -f "$links"
-    grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' >> "$links"
-
-    awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"
-
-    echo "Done extracting links"
-}
-
-## Push to Archive
-function pushToArchive(){
-# Use: Takes a txt file with one link on each line and pushes all the links to the internet archive. Saves those links to a textfile
-# References:
-# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
-# https://github.com/oduwsdl/archivenow
-# For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999
-
-    echo ""
-    echo "Pushing to archive.org..."
-    numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
-    totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
-    echo "Expected to take ~$totalTimeInMinutes mins."
- echo "" - - ## rm -f "$archivedLinks" - rm -f "$errors" - touch "$archivedLinks" - touch "$errors" - - ## How to deal with errors that arise - echo "If this file contains errors, you can deal with them as follows:" >> "$errors" - echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors" - echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors" - echo "" >> "$errors" - - ## Main body - counter=1 - while IFS= read -r line - do - wait - if [ $(($counter % 15)) -eq 0 ]; then - printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n" - sleep 1m - fi - echo "Url: $line" - urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 ) - if [ "$urlAlreadyContained" == "" ]; then - archiveURL=$(archivenow --ia $line) - if [[ "$archiveURL" == "Error"* ]]; then - echo "$line" >> "$errors" - echo "$archiveURL" >> "$errors" - echo "" >> "$errors" - echo "There was an error. See $errors for how to deal with it." - else - echo "$archiveURL" >> "$archivedLinks" - fi - counter=$((counter+1)) - numSecondsSleep=$((5+ ($RANDOM%15))) - else - archiveURL="$urlAlreadyContained" - numSecondsSleep=0 - fi - echo $archiveURL - echo "Sleeping for $numSecondsSleep seconds..." - sleep $numSecondsSleep - echo "" - done < "$links" - - echo "Done pushing links to archive.org" - echo "" -} - -## Add archive links to file -function addArchiveLinksToFile(){ - - echo "Creating longnow file at $output" - - rm -f "$output" - cp "$input" "$output" - - while IFS= read -r url - do - wait - archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1) - if [ "$archivedUrl" != "" ]; then - ## echo "Url: $url" - ## echo "ArchivedUrl: $archivedUrl" - urlForSed="${url//\//\\/}" - archiveUrlForSed="${archivedUrl//\//\\/}" - sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output" - ##else - ##echo "There was an error for $url; see the $errorsFile" - fi - done < "$links" - - echo "Done." -} - -## Explain installation -function explainInstallation(){ - echo "Required archivenow utility not found in path." - echo "Install with \$ pip install archivenow" - echo "(resp. \$ pip3 install archivenow)" - echo "Or follow instructions on https://github.com/oduwsdl/archivenow" -} - -## Report errors -function reportErrors(){ - numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }') - if [ "$numLinesErrorFile" -gt 4 ]; then - echo "It seems that there are errors. 
To view and deal with them, see the $errors file" - fi -} - -## Clean up -function cleanup(){ - cp "$output" "../$output" - cd "$initialDir" -} - -## Main -function main(){ - doesArchiveNowExist="$(whereis "archivenow")" - if [ "$doesArchiveNowExist" == "archivenow:" ] - then - explainInstallation - else - moveToWorkDir - extractMarkdownLinks - pushToArchive - addArchiveLinksToFile - reportErrors - cleanup - fi -} -main - diff --git a/longnow.sh b/longnow.sh @@ -0,0 +1,180 @@ +# Filenames +input="$1" +root="$(echo "$input" | sed 's/.md//g' )" +links="$root.links.txt" +archivedLinks="$root.links.archived.txt" +errors="$root.errors.txt" +output="$root.longnow.md" + +## Directories +initialDir="$(pwd)" +workdir="longnow-$root" + +## Move to work dir +function moveToWorkDir(){ + mkdir -p "$workdir" + cp "$input" "$workdir/$input" + cd "$workdir" +} + +## Extract markdown links +function extractMarkdownLinks(){ # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links + links2="$root.links2.txt" + echo "" + echo "Extracting links..." + + rm -f "$links" + grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' >> "$links" + + awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links" + + echo "Done extracting links" +} + +## Push to Archive +function pushToArchive(){ +# Use: Takes a txt file with one link on each line and pushes all the links to the internet archive. Saves those links to a textfile +# References: +# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file +# https://github.com/oduwsdl/archivenow +# For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999 + + echo "" + echo "Pushing to archive.org..." + numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }') + totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc) + echo "Expected to take ~$totalTimeInMinutes mins." + echo "" + + ## rm -f "$archivedLinks" + rm -f "$errors" + touch "$archivedLinks" + touch "$errors" + + ## How to deal with errors that arise + echo "If this file contains errors, you can deal with them as follows:" >> "$errors" + echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors" + echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors" + echo "" >> "$errors" + + ## Main body + counter=1 + while IFS= read -r line + do + wait + if [ $(($counter % 15)) -eq 0 ]; then + printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n" + sleep 1m + fi + echo "Url: $line" + urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 ) + + if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then + urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )" + if [ "$urlAlreadyInArchiveOnline" == "" ]; then + echo "Sending to archive..." 
+                archiveURL=$(archivenow --ia $line)
+                if [[ "$archiveURL" == "Error"* ]]; then
+                    echo "$line" >> "$errors"
+                    echo "$archiveURL" >> "$errors"
+                    echo "" >> "$errors"
+                    echo "There was an error. See $errors for how to deal with it."
+                    echo ""
+                else
+                    echo "$archiveURL" >> "$archivedLinks"
+                fi
+                counter=$((counter+1))
+                numSecondsSleep=$((5+ ($RANDOM%15)))
+            else
+                echo "Already in archive.org: $urlAlreadyInArchiveOnline"
+                echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
+                echo ""
+                numSecondsSleep=0
+            fi
+        elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
+            echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
+            archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
+            numSecondsSleep=0
+            # echo $archiveURL
+            echo "Sleeping for $numSecondsSleep seconds..."
+            sleep $numSecondsSleep
+            echo ""
+        fi
+    done < "$links"
+
+    echo "Done pushing links to archive.org"
+    echo ""
+}
+
+## Add archive links to file
+function addArchiveLinksToFile(){
+
+    echo "Creating longnow file at $output"
+
+    rm -f "$output"
+    cp "$input" "$output"
+
+    while IFS= read -r url
+    do
+        wait
+        archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1)
+        if [ "$archivedUrl" != "" ]; then
+            ## echo "Url: $url"
+            ## echo "ArchivedUrl: $archivedUrl"
+            urlForSed="${url//\//\\/}"
+            archiveUrlForSed="${archivedUrl//\//\\/}"
+            sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
+        ##else
+            ##echo "There was an error for $url; see the $errorsFile"
+        fi
+    done < "$links"
+
+    echo "Done."
+}
+
+## Explain installation
+function explainArchiveNowInstallation(){
+    echo "Required archivenow utility not found in path."
+    echo "Install with \$ pip install archivenow"
+    echo "(resp. \$ pip3 install archivenow)"
+    echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
+}
+
+function explainJqInstallation(){
+    echo "Required jq utility not found in path."
+    echo "Install with your package manager, e.g., \$ sudo apt install jq"
+    echo "Or follow instructions on https://stedolan.github.io/jq/download/"
+}
+
+## Report errors
+function reportErrors(){
+    numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
+    if [ "$numLinesErrorFile" -gt 4 ]; then
+        echo "It seems that there are errors. To view and deal with them, see the $errors file"
+    fi
+}
+
+## Clean up
+function cleanup(){
+    cp "$output" "../$output"
+    cd "$initialDir"
+}
+
+## Main
+function main(){
+    doesArchiveNowExist="$(whereis "archivenow")"
+    doesJqExist="$(whereis "jq")"
+    if [ "$doesArchiveNowExist" == "archivenow:" ]; then
+        explainArchiveNowInstallation
+    elif [ "$doesJqExist" == "jq:" ]; then
+        explainJqInstallation
+    else
+        moveToWorkDir
+        extractMarkdownLinks
+        pushToArchive
+        addArchiveLinksToFile
+        reportErrors
+        cleanup
+    fi
+}
+main
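
For orientation, here is a hypothetical run of the new script, assuming it has been installed on the PATH as `longnow` per the README; the file names follow the patterns defined at the top of longnow.sh:

```
longnow post.md
# Works in a longnow-post/ directory and produces:
#   post.links.txt           unique links extracted from post.md
#   post.links.archived.txt  one archive.org URL per link
#   post.errors.txt          links that could not be archived, plus recovery instructions
#   post.longnow.md          copy of post.md with each [text](url) followed by ([a](archive-url))
# The final post.longnow.md is copied back to the starting directory.
```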