commit d8515b22df3eab2d9c0940a21b85c2064a2756d7
parent 26b83deb8e6d80f7dc92b8e922c02371fde42153
Author: NunoSempere <nuno.sempere@protonmail.com>
Date: Sat, 5 Mar 2022 10:50:30 -0500
feat: Use archive.org snapshot if it already exists.
Diffstat:
| M | README.md | | | 11 | ++++++++++- |
| D | longnow | | | 161 | ------------------------------------------------------------------------------- |
| A | longnow.sh | | | 180 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 190 insertions(+), 162 deletions(-)
diff --git a/README.md b/README.md
@@ -4,7 +4,6 @@ I use it to archive links in [this forecasting newsletter](https://forecasting.s
> Note to the future: All links are added automatically to the Internet Archive, using this [tool](https://github.com/NunoSempere/longNowForMd) ([a](https://web.archive.org/web/20220109144543/https://github.com/NunoSempere/longNowForMd)). "(a)" for archived links was inspired by [Milan Griffes](https://www.flightfromperfection.com/) ([a](https://web.archive.org/web/20220109144604/https://www.flightfromperfection.com/)), [Andrew Zuckerman](https://www.andzuck.com/) ([a](https://web.archive.org/web/20211202120912/https://www.andzuck.com/)), and [Alexey Guzey](https://guzey.com/) ([a](https://web.archive.org/web/20220109144733/https://guzey.com/)).
-
## How to install
Add [this file](https://github.com/NunoSempere/longNowForMd/blob/master/longnow) to your path, for instance by moving it to the `/usr/bin` folder and giving it execute permissions (with `chmod 755 longnow`)
@@ -21,6 +20,16 @@ In addition, this utility requires [archivenow](https://github.com/oduwsdl/archi
pip install archivenow ## respectively, pip3
```
+It also requires [jq](https://stedolan.github.io/jq/download/), which can be installed as:
+
+```
+sudo apt install jq
+```
+
+if on Debian, or using your distribution's package manager otherwise.
+
+As of the newest iteration of this program, if archive.org already has a snapshot of the page, that snapshot is used instead of requesting a new one. This results in massive time savings, but may mean that a less up-to-date copy is used. If this behavior is not desired, it can easily be removed by deleting the lines around `if [ "$urlAlreadyInArchiveOnline" == "" ]; then`.
+
## How to use
```
diff --git a/longnow b/longnow
@@ -1,161 +0,0 @@
-# Filenames
-input="$1"
-root="$(echo "$input" | sed 's/.md//g' )"
-links="$root.links.txt"
-archivedLinks="$root.links.archived.txt"
-errors="$root.errors.txt"
-output="$root.longnow.md"
-
-## Directories
-initialDir="$(pwd)"
-workdir="longnow-$root"
-
-## Move to work dir
-function moveToWorkDir(){
- mkdir -p "$workdir"
- cp "$input" "$workdir/$input"
- cd "$workdir"
-}
-
-## Extract markdown links
-function extractMarkdownLinks(){ # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links
- links2="$root.links2.txt"
- echo ""
- echo "Extracting links..."
-
- rm -f "$links"
- grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' >> "$links"
-
- awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"
-
- echo "Done extracting links"
-}
-
-## Push to Archive
-function pushToArchive(){
-# Use: Takes a txt file with one link on each line and pushes all the links to the internet archive. Saves those links to a textfile
-# References:
-# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
-# https://github.com/oduwsdl/archivenow
-# For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999
-
- echo ""
- echo "Pushing to archive.org..."
- numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
- totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
- echo "Expected to take ~$totalTimeInMinutes mins."
- echo ""
-
- ## rm -f "$archivedLinks"
- rm -f "$errors"
- touch "$archivedLinks"
- touch "$errors"
-
- ## How to deal with errors that arise
- echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
- echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
- echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
- echo "" >> "$errors"
-
- ## Main body
- counter=1
- while IFS= read -r line
- do
- wait
- if [ $(($counter % 15)) -eq 0 ]; then
- printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
- sleep 1m
- fi
- echo "Url: $line"
- urlAlreadyContained=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
- if [ "$urlAlreadyContained" == "" ]; then
- archiveURL=$(archivenow --ia $line)
- if [[ "$archiveURL" == "Error"* ]]; then
- echo "$line" >> "$errors"
- echo "$archiveURL" >> "$errors"
- echo "" >> "$errors"
- echo "There was an error. See $errors for how to deal with it."
- else
- echo "$archiveURL" >> "$archivedLinks"
- fi
- counter=$((counter+1))
- numSecondsSleep=$((5+ ($RANDOM%15)))
- else
- archiveURL="$urlAlreadyContained"
- numSecondsSleep=0
- fi
- echo $archiveURL
- echo "Sleeping for $numSecondsSleep seconds..."
- sleep $numSecondsSleep
- echo ""
- done < "$links"
-
- echo "Done pushing links to archive.org"
- echo ""
-}
-
-## Add archive links to file
-function addArchiveLinksToFile(){
-
- echo "Creating longnow file at $output"
-
- rm -f "$output"
- cp "$input" "$output"
-
- while IFS= read -r url
- do
- wait
- archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1)
- if [ "$archivedUrl" != "" ]; then
- ## echo "Url: $url"
- ## echo "ArchivedUrl: $archivedUrl"
- urlForSed="${url//\//\\/}"
- archiveUrlForSed="${archivedUrl//\//\\/}"
- sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
- ##else
- ##echo "There was an error for $url; see the $errorsFile"
- fi
- done < "$links"
-
- echo "Done."
-}
-
-## Explain installation
-function explainInstallation(){
- echo "Required archivenow utility not found in path."
- echo "Install with \$ pip install archivenow"
- echo "(resp. \$ pip3 install archivenow)"
- echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
-}
-
-## Report errors
-function reportErrors(){
- numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
- if [ "$numLinesErrorFile" -gt 4 ]; then
- echo "It seems that there are errors. To view and deal with them, see the $errors file"
- fi
-}
-
-## Clean up
-function cleanup(){
- cp "$output" "../$output"
- cd "$initialDir"
-}
-
-## Main
-function main(){
- doesArchiveNowExist="$(whereis "archivenow")"
- if [ "$doesArchiveNowExist" == "archivenow:" ]
- then
- explainInstallation
- else
- moveToWorkDir
- extractMarkdownLinks
- pushToArchive
- addArchiveLinksToFile
- reportErrors
- cleanup
- fi
-}
-main
-
diff --git a/longnow.sh b/longnow.sh
@@ -0,0 +1,180 @@
+# Filenames
+input="$1"
+root="$(echo "$input" | sed 's/.md//g' )"
+links="$root.links.txt"
+archivedLinks="$root.links.archived.txt"
+errors="$root.errors.txt"
+output="$root.longnow.md"
+
+## Directories
+initialDir="$(pwd)"
+workdir="longnow-$root"
+
+## Move to work dir
+function moveToWorkDir(){
+ mkdir -p "$workdir"
+ cp "$input" "$workdir/$input"
+ cd "$workdir"
+}
+
+## Extract markdown links
+function extractMarkdownLinks(){ # Use: Takes a markdown file file.md, extracts all links, finds the unique ones and saves them to file.md.links
+ links2="$root.links2.txt"
+ echo ""
+ echo "Extracting links..."
+
+ rm -f "$links"
+ grep -Eoi '\]\((.*)\)' "$input" | grep -Eo '(http|https)://[^)]+' >> "$links"
+
+ awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"
+
+ echo "Done extracting links"
+}
+
+## Push to Archive
+function pushToArchive(){
+# Use: Takes a txt file with one link on each line and pushes all the links to the internet archive. Saves those links to a textfile
+# References:
+# https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
+# https://github.com/oduwsdl/archivenow
+# For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999
+
+ echo ""
+ echo "Pushing to archive.org..."
+ numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
+ totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
+ echo "Expected to take ~$totalTimeInMinutes mins."
+ echo ""
+
+ ## rm -f "$archivedLinks"
+ rm -f "$errors"
+ touch "$archivedLinks"
+ touch "$errors"
+
+ ## How to deal with errors that arise
+ echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
+ echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.md.links.archived, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
+ echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.md.links.archived file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
+ echo "" >> "$errors"
+
+ ## Main body
+ counter=1
+ while IFS= read -r line
+ do
+ wait
+ if [ $(($counter % 15)) -eq 0 ]; then
+ printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
+ sleep 1m
+ fi
+ echo "Url: $line"
+ urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )
+
+ if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
+ urlAlreadyInArchiveOnline="$(curl --silent http://archive.org/wayback/available?url=$line | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g' )"
+ if [ "$urlAlreadyInArchiveOnline" == "" ]; then
+ echo "Sending to archive..."
+ archiveURL=$(archivenow --ia $line)
+ if [[ "$archiveURL" == "Error"* ]]; then
+ echo "$line" >> "$errors"
+ echo "$archiveURL" >> "$errors"
+ echo "" >> "$errors"
+ echo "There was an error. See $errors for how to deal with it."
+ echo ""
+ else
+ echo "$archiveURL" >> "$archivedLinks"
+ fi
+ counter=$((counter+1))
+ numSecondsSleep=$((5+ ($RANDOM%15)))
+ else
+ echo "Already in archive.org: $urlAlreadyInArchiveOnline"
+ echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
+ echo ""
+ numSecondsSleep=0
+ fi
+ elif [ ! -z "$urlAlreadyContainedInLocalArchivedLinks" ]; then
+ echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
+ archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
+ numSecondsSleep=0
+ # echo $archiveURL
+ echo "Sleeping for $numSecondsSleep seconds..."
+ sleep $numSecondsSleep
+ echo ""
+ fi
+ done < "$links"
+
+ echo "Done pushing links to archive.org"
+ echo ""
+}
+
+## Add archive links to file
+function addArchiveLinksToFile(){
+
+ echo "Creating longnow file at $output"
+
+ rm -f "$output"
+ cp "$input" "$output"
+
+ while IFS= read -r url
+ do
+ wait
+ archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks") | tail -1)
+ if [ "$archivedUrl" != "" ]; then
+ ## echo "Url: $url"
+ ## echo "ArchivedUrl: $archivedUrl"
+ urlForSed="${url//\//\\/}"
+ archiveUrlForSed="${archivedUrl//\//\\/}"
+ sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
+ ##else
+ ##echo "There was an error for $url; see the $errorsFile"
+ fi
+ done < "$links"
+
+ echo "Done."
+}
+
+## Explain installation
+function explainArchiveNowInstallation(){
+ echo "Required archivenow utility not found in path."
+ echo "Install with \$ pip install archivenow"
+ echo "(resp. \$ pip3 install archivenow)"
+ echo "Or follow instructions on https://github.com/oduwsdl/archivenow"
+}
+
+function explainJqInstallation(){
+ echo "Required jq utility not found in path."
+ echo "Install with your package manager, e.g., \$ sudo apt install jq"
+ echo "Or follow instructions on https://stedolan.github.io/jq/download/"
+}
+## Report errors
+function reportErrors(){
+ numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
+ if [ "$numLinesErrorFile" -gt 4 ]; then
+ echo "It seems that there are errors. To view and deal with them, see the $errors file"
+ fi
+}
+
+## Clean up
+function cleanup(){
+ cp "$output" "../$output"
+ cd "$initialDir"
+}
+
+## Main
+function main(){
+ doesArchiveNowExist="$(whereis "archivenow")"
+ doesJqExist="$(whereis "jq")"
+ if [ "$doesArchiveNowExist" == "archivenow:" ]; then
+ explainArchiveNowInstallation
+ elif [ "$doesJqExist" == "jq:" ]; then
+ explainJqInstallation
+ else
+ moveToWorkDir
+ extractMarkdownLinks
+ pushToArchive
+ addArchiveLinksToFile
+ reportErrors
+ cleanup
+ fi
+}
+main
+