diskprices_extractor/diskprices.py

#!/usr/bin/env python
######################################################################
# Another (better) version of the diskprices extractor, written in
# Python. It's slow, but it works.
#
# Examples:
#
# $ ./diskprices.py -d 20150821000331 -o csv
# $ ./diskprices.py -d 20150821000331 -o json
#
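# Note: the -d value looks like a Wayback Machine snapshot timestamp in
# the usual YYYYMMDDhhmmss form (20150821000331 = 2015-08-21 00:03:31 UTC).
#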
######################################################################
import sys
import argparse
import csv
import html
import json

import httpx
from bs4 import BeautifulSoup


def fetch(date, target="https://diskprices.com", timeout=30.0):
    """
    Fetch the archived HTML from archive.org. httpx is used instead of
    the standard-library HTTP client because the snapshot URL redirects
    and httpx can follow the redirects (follow_redirects=True).
    """
    parts = ["https://web.archive.org", "web", date, target]
    url = "/".join(parts)
    response = httpx.get(url, follow_redirects=True, timeout=timeout)
    return response.text
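
# For reference, the joined URL follows the usual Wayback Machine scheme
# /web/<timestamp>/<original-url>, e.g.:
#   https://web.archive.org/web/20150821000331/https://diskprices.com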


def parse(page):
    """
    Parse the diskprices HTML page and convert the table into a list of rows.
    """
    parsed = BeautifulSoup(page, features="html.parser")
    table = parsed.find(id="diskprices")
    lines = table.find_all('tr') if table else parsed.find_all('tr')
    result = []
    for line in lines:
        # first we look for header cells
        headers = line.find_all('th')
        if headers:
            result.append([string_converter(x.text) for x in headers])
            continue
        # then we look for data cells
        items = line.find_all('td')
        if items:
            result.append([string_converter(x.text) for x in items])
    return result
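
# parse() returns a list of rows: the first row holds the <th> header texts
# and the remaining rows hold the <td> cell texts, all passed through
# string_converter().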


def string_converter(text):
    """
    Sanitize each cell: unescape HTML entities and drop non-ASCII characters.
    """
    entities = html.unescape(text)
    cleanup = entities.encode("ascii", "ignore")
    return cleanup.decode("ascii")
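
# Illustrative example (made-up input, not taken from the live page):
#   string_converter("4TB &amp; café")  ->  "4TB & caf"
# html.unescape() turns "&amp;" into "&", and the ascii/"ignore" round
# trip drops the non-ASCII "é".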


def to_csv(parsed):
    """
    Write the parsed rows to stdout as semicolon-delimited CSV.
    """
    writer = csv.writer(sys.stdout, quotechar='"', delimiter=';',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerows(parsed)


def to_json(parsed):
    """
    Convert the parsed rows to a JSON string: one object per data row,
    keyed by the header row.
    """
    headers = parsed[0]
    buffer = []
    for line in parsed[1:]:
        buffer.append(dict(zip(headers, line)))
    return json.dumps(buffer)
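
# The JSON output is a list of row objects keyed by the header row, roughly:
#   [{"<header 1>": "<cell 1>", "<header 2>": "<cell 2>", ...}, ...]
# (the actual column names depend on the archived page's table headers).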


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='diskprices',
        description='diskprices extractor',
    )
    parser.add_argument('-d', '--date', required=True)
    parser.add_argument('-o', '--output', default="csv")
    # parser.add_argument('-t', '--timeout')
    args = parser.parse_args()

    page = fetch(args.date)
    parsed = parse(page)
    if args.output == "json":
        print(to_json(parsed))
    else:
        to_csv(parsed)
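
# Both output modes write to stdout, so the result can simply be redirected:
#   $ ./diskprices.py -d 20150821000331 -o csv > diskprices.csv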