Files
diskprices_extractor/extractor.sh
2024-08-11 17:36:24 +00:00

64 lines
1.2 KiB
Bash
Executable File

#!/bin/sh
######################################################################
# quick and dirty diskprices.org parser/converter from html to csv.
######################################################################
# Print a one-line usage message and terminate the script.
# Arguments: none (the script name is taken from $0)
# Outputs:   the usage line on stderr, so that it never ends up in the
#            CSV stream when stdout is redirected to a file or a pipe
# Exits:     always, with status 1
_usage() {
  printf "Usage: %s [html_file]\n" "${0}" >&2
  exit 1
}
# Reduce an HTML document read from stdin to a stream with one value per
# line, written to stdout.
#
# grep keeps only the lines that contain "<td", "<th", "<tr" or "<a".
# sed then applies the following rules to every kept line, in this order,
# each rule working on the result of the previous one:
#   1. a line containing "<tr" is replaced by a lone "%" (the row marker
#      that _to_csv looks for);
#   2. "<td ...><a ...>TEXT</a></td>" is reduced to TEXT;
#   3. "<td>TEXT<..." is reduced to TEXT (up to the last "<" on the line);
#   4. "<td class=...>TEXT<..." is reduced to TEXT in the same way;
#   5. everything up to and including '<td class="name">' is removed
#      (presumably for name cells whose text is not closed on the same
#      line -- TODO confirm against a real page);
#   6. "<th ...>TEXT<..." is reduced to TEXT;
#   7. any remaining "<a ...>TEXT</a" is reduced to TEXT.
# All patterns are greedy and line-based, so the result depends on how the
# source HTML is split into lines.
_parser() {
grep -E -e '<t(d|h|r)' -e '<a' \
| sed -E -e 's:.*<tr.*:%:' \
-e 's:.*<td .*><a.*>(.*)</a></td>.*:\1:' \
-e 's:^.*<td>(.*)<.*$:\1:' \
-e 's:^.*<td class=.*>(.*)<.*$:\1:' \
-e 's:^.*<td class="name">::' \
-e 's:^.*<th.*>(.*)<.*$:\1:' \
-e 's:^.*<a.*>(.*)</a.*$:\1:'
}
# Fold a one-value-per-line stream into ";"-separated records.
# Input (stdin):   cell values, one per line, with a line consisting of a
#                  single "%" in front of every row (as emitted by _parser)
# Output (stdout): one line per row, cells joined with ";". A marker that
#                  closes an empty record still prints an empty line;
#                  _csv_cleanup removes those.
_to_csv() {
  awk '
    # Row marker: emit the record collected so far and start a new one.
    # Only a line that is exactly "%" counts, so a cell value such as
    # "100% new" is kept as data instead of being taken for a marker.
    $0 == "%" {
      print rec
      rec = ""
      next
    }

    # Any other line is a cell value; append it to the current record.
    {
      rec = (rec == "") ? $0 : rec ";" $0
    }

    # The input does not end with a marker, so the last record has to be
    # flushed here; without this the final row would be lost.
    END {
      if (rec != "") {
        print rec
      }
    }
  '
}
# Drop non-data records from the CSV stream and strip HTML entities.
# Input (stdin):   ";"-separated records, one per line
# Output (stdout): the same records without
#   - lines starting with the country selector "US;UK;DE;CA;ES" (this one
#     prefix covers every longer variant of the selector),
#   - lines starting with one of the known footer/help link texts,
#   - empty lines,
#   and with every "&name;" / "&#NNN;" entity removed (not decoded).
_csv_cleanup() {
  # "&", "#" and ";" are ordinary characters in an ERE, so they are left
  # unescaped; inside a bracket expression a backslash would be matched
  # literally rather than act as an escape.
  sed -E \
    -e '/^US;UK;DE;CA;ES/d' \
    -e '/^Libera IRC/d' \
    -e '/^Mailing list/d' \
    -e '/^Learn more/d' \
    -e '/^but how/d' \
    -e '/^$/d' \
    -e '/^webchat/d' \
    -e 's/&[A-Za-z0-9#]+;//g'
}
# Run the whole conversion: HTML file -> value stream -> records -> clean CSV.
# Arguments: $1 - path of the HTML file to convert
# Outputs:   the resulting CSV on stdout
_main() {
  # The path is quoted so that names containing spaces or glob characters
  # are passed through intact; the file is fed in by redirection.
  _parser < "${1}" | _to_csv | _csv_cleanup
}
# Entry point: the first argument must be non-empty and name an existing
# path; otherwise show the usage text (which exits). Then convert the file.
if [ -z "${1}" ] || [ ! -e "${1}" ]; then
  _usage
fi
_main "${1}"