longnow.sh (5932B)
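#!/usr/bin/env bash
## longnow.sh: pushes the links in a markdown file to the Internet Archive
## and writes a copy of the file with an archived mirror after each link.
## Usage: $ longnow yourfile.md (or $ bash longnow.sh yourfile.md)
## Requires: archivenow and jq (checked for in main below).
## (This header block is an added summary; it describes, rather than changes,
## what the script below does.)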
## Filenames
input="$1"
root="$(echo "$input" | sed 's/\.md$//')"
links="$root.links.txt"
archivedLinks="$root.links.archived.txt"
errors="$root.errors.txt"
output="$root.longnow.md"

## Directories
initialDir="$(pwd)"
workdir="longnow-$root"

## Move to work dir
function moveToWorkDir(){
    mkdir -p "$workdir"
    cp "$input" "$workdir/$input"
    cd "$workdir" || exit
}

## Extract markdown links
function extractMarkdownLinks(){
    # Use: Takes a markdown file file.md, extracts all links, keeps the unique ones and saves them to file.links.txt
    links2="$root.links2.txt"
    echo ""
    echo "Extracting links..."

    rm -f "$links"
    grep -Eoi '\]\((.*)\)' "$input" | grep -Eo 'https?://[^)]+' >> "$links"

    ## Deduplicate while preserving order
    awk '!seen[$0]++' "$links" > "$links2" && mv "$links2" "$links"

    echo "Done extracting links"
}

## Push to Archive
function pushToArchive(){
    # Use: Takes a txt file with one link on each line and pushes all the links to the Internet Archive. Saves those links to a text file.
    # References:
    # https://unix.stackexchange.com/questions/181254/how-to-use-grep-and-cut-in-script-to-obtain-website-urls-from-an-html-file
    # https://github.com/oduwsdl/archivenow
    # For the double underscore, see: https://stackoverflow.com/questions/13797087/bash-why-double-underline-for-private-functions-why-for-bash-complet/15181999

    echo ""
    echo "Pushing to archive.org..."
    numLinesLinkFile=$(wc -l "$links" | awk '{ print $1 }')
    totalTimeInMinutes=$(echo "scale=0; ($numLinesLinkFile*7.5 + 60*$numLinesLinkFile/15)/60" | bc)
    echo "Expected to take ~$totalTimeInMinutes mins."
    echo ""

    ## rm -f "$archivedLinks" ## Intentionally commented out: keeping the file lets past runs serve as a cache.
    rm -f "$errors"
    touch "$archivedLinks"
    touch "$errors"

    ## How to deal with errors that arise
    echo "If this file contains errors, you can deal with them as follows:" >> "$errors"
    echo "- Do another pass with \$ longnow yourfile.md. If you don't delete yourfile.links.archived.txt, past archive links are remembered, and only the links which are not there are sent again" >> "$errors"
    echo "- Input the offending links manually to https://archive.org/, add the results to the yourfile.links.archived.txt file manually, and then do another pass with \$ longnow yourfile.md" >> "$errors"
    echo "" >> "$errors"

    ## Main body
    counter=1
    while IFS= read -r line
    do
        ## archive.org rate-limits submissions, so pause after every 15th link.
        if [ $((counter % 15)) -eq 0 ]; then
            printf "Archive.org doesn't accept more than 15 links per min; sleeping for 1min...\n\n"
            sleep 1m
        fi
        echo "Url: $line"
        ## Match the url both with and without a trailing slash.
        urlAlreadyContainedInLocalArchivedLinks=$( ( grep "$line$" "$archivedLinks"; grep "$line/$" "$archivedLinks" ) | tail -1 )

        if [ "$urlAlreadyContainedInLocalArchivedLinks" == "" ]; then
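            ## The Wayback availability endpoint returns JSON shaped roughly
            ## like this (illustrative; see https://archive.org/help/wayback_api.php):
            ##   {"archived_snapshots": {"closest": {"url": "http://web.archive.org/web/...", "available": true, ...}}}
            ## jq extracts .archived_snapshots.closest.url, which is the
            ## literal null when no snapshot exists; the two seds strip the
            ## quotes and turn that null into an empty string.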
            urlAlreadyInArchiveOnline="$(curl --silent "http://archive.org/wayback/available?url=$line" | jq '.archived_snapshots.closest.url' | sed 's/"//g' | sed 's/null//g')"
            if [ "$urlAlreadyInArchiveOnline" == "" ]; then
                echo "Sending to archive..."
                archiveURL=$(archivenow --ia "$line")
                if [[ "$archiveURL" == "Error"* ]]; then
                    echo "$line" >> "$errors"
                    echo "$archiveURL" >> "$errors"
                    echo "" >> "$errors"
                    echo "There was an error. See $errors for how to deal with it."
                    echo ""
                else
                    echo "$archiveURL" >> "$archivedLinks"
                fi
                counter=$((counter+1))
                numSecondsSleep=$((5 + (RANDOM % 15)))
            else
                echo "Already in archive.org: $urlAlreadyInArchiveOnline"
                echo "$urlAlreadyInArchiveOnline" >> "$archivedLinks"
                echo ""
                numSecondsSleep=0
            fi
        else
            echo "Already in local archive: $urlAlreadyContainedInLocalArchivedLinks"
            archiveURL="$urlAlreadyContainedInLocalArchivedLinks"
            numSecondsSleep=0
        fi
        echo "Sleeping for $numSecondsSleep seconds..."
        sleep "$numSecondsSleep"
        echo ""
    done < "$links"

    echo "Done pushing links to archive.org"
    echo ""
}

## Add archive links to file
function addArchiveLinksToFile(){
    echo "Creating longnow file at $output"

    rm -f "$output"
    cp "$input" "$output"

    while IFS= read -r url
    do
        archivedUrl=$( ( grep "$url$" "$archivedLinks"; grep "$url/$" "$archivedLinks" ) | tail -1 )
        if [ "$archivedUrl" != "" ]; then
            ## Escape slashes so the urls can be spliced into the sed pattern.
            urlForSed="${url//\//\\/}"
            archiveUrlForSed="${archivedUrl//\//\\/}"
            sed -i "s/$urlForSed)/$urlForSed) ([a]($archiveUrlForSed))/g" "$output"
        fi
    done < "$links"

    echo "Done."
}

## Explain installation
function explainArchiveNowInstallation(){
    echo "Required archivenow utility not found in path."
    echo "Install it with \$ pip install archivenow"
    echo "(resp. \$ pip3 install archivenow)"
    echo "Or follow the instructions on https://github.com/oduwsdl/archivenow"
}

function explainJqInstallation(){
    echo "Required jq utility not found in path."
    echo "Install it with your package manager, e.g., \$ sudo apt install jq"
    echo "Or follow the instructions on https://stedolan.github.io/jq/download/"
}

## Report errors
function reportErrors(){
    ## The errors file starts with four lines of instructions, so more than four lines means real errors.
    numLinesErrorFile=$(wc -l "$errors" | awk '{ print $1 }')
    if [ "$numLinesErrorFile" -gt 4 ]; then
        echo "It seems that there are errors. To view and deal with them, see the $errors file"
    fi
}

## Clean up
function cleanup(){
    cp "$output" "../$output"
    cd "$initialDir" || exit
}

## Main
function main(){
    ## `whereis foo` prints just "foo:" when the binary isn't found.
    doesArchiveNowExist="$(whereis "archivenow")"
    doesJqExist="$(whereis "jq")"
    if [ "$doesArchiveNowExist" == "archivenow:" ]; then
        explainArchiveNowInstallation
    elif [ "$doesJqExist" == "jq:" ]; then
        explainJqInstallation
    else
        moveToWorkDir
        extractMarkdownLinks
        pushToArchive
        addArchiveLinksToFile
        reportErrors
        cleanup
    fi
}
main
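## Illustrative run (filenames follow from the variables at the top;
## notes.md is a hypothetical input):
##   $ longnow notes.md
## works inside longnow-notes/, writes notes.links.txt,
## notes.links.archived.txt and notes.errors.txt there, and copies the
## resulting notes.longnow.md back to the starting directory. An archived
## link in the output looks roughly like:
##   [a post](https://example.com/post) ([a](https://web.archive.org/web/20210101000000/https://example.com/post))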