#!/usr/bin/env python
|
|
######################################################################
|
|
# Another (better) version of diskprices extractor using python. It's
|
|
# slow, but it works.
|
|
#
|
|
# Example:
|
|
#
|
|
# $ ./diskprices.py -d 20150821000331 -o csv
|
|
# $ ./diskprices.py -d 20150821000331 -o json
|
|
#
|
|
######################################################################
|
|
|
|
import sys
|
|
import argparse
|
|
import html
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
import csv
|
|
import json
|
|
|
|
def fetch(date, target="https://diskprices.com", timeout=30.0):
    """
    Retrieve an archived snapshot of `target` from web.archive.org.

    httpx is used instead of the stdlib HTTP client because archive.org
    answers with redirects that have to be followed.
    """
    snapshot_url = f"https://web.archive.org/web/{date}/{target}"
    response = httpx.get(snapshot_url, follow_redirects=True, timeout=timeout)
    return response.text
|
|
|
|
def parse(page):
    """
    Parse a diskprices HTML page into a list of rows.

    The first appended row is the header row (``<th>`` cells); the
    remaining rows are data rows (``<td>`` cells). Every cell's text is
    passed through string_converter().
    """
    parsed = BeautifulSoup(page, features="html.parser")
    # Scope the row search to the prices table when it is present; fall
    # back to the whole document otherwise. (The original looked up the
    # table but then never used it, scanning every <tr> in the page.)
    table = parsed.find(id="diskprices") or parsed
    result = []
    for line in table.find_all('tr'):
        # Header row: <th> cells.
        headers = line.find_all('th')
        if headers:
            result.append([string_converter(cell.text) for cell in headers])
            # The original had a bare `next` here, which only evaluates
            # the builtin and does nothing; `continue` is what was meant.
            continue
        # Data row: <td> cells.
        items = line.find_all('td')
        if items:
            result.append([string_converter(cell.text) for cell in items])
    return result
|
|
|
|
def string_converter(text):
    """
    Sanitize one cell's text: decode HTML entities, then drop every
    non-ASCII character.
    """
    # encode(..., "ignore") silently discards characters outside ASCII;
    # decoding the resulting bytes gives back a plain str.
    return html.unescape(text).encode("ascii", "ignore").decode("utf-8")
|
|
|
|
def to_csv(parsed):
    """
    Write the parsed rows to stdout as semicolon-delimited CSV.
    """
    writer = csv.writer(sys.stdout, quotechar='"', delimiter=';',
                        quoting=csv.QUOTE_MINIMAL)
    # writerows() iterates the rows exactly like the original loop did.
    writer.writerows(parsed)
|
|
|
|
def to_json(parsed):
    """
    Convert the parsed rows to a JSON array of objects.

    The first row supplies the keys; each remaining row becomes one
    object. Unlike the previous pop(0)-based version, the caller's list
    is left unmodified, and an empty input yields "[]" instead of
    raising IndexError.
    """
    if not parsed:
        return json.dumps([])
    headers, *rows = parsed
    # zip() pairs each cell with its column header, replacing the manual
    # index counter of the original implementation.
    return json.dumps([dict(zip(headers, row)) for row in rows])
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: fetch an archived snapshot, parse its
    # price table, and print it as CSV (default) or JSON.
    parser = argparse.ArgumentParser(
        prog='diskprices',
        description='diskprices extractor',
    )
    parser.add_argument('-d', '--date', required=True,
                        help='archive.org snapshot timestamp, e.g. 20150821000331')
    parser.add_argument('-o', '--output', default="csv",
                        choices=("csv", "json"),
                        help='output format (default: csv)')
    # Previously a commented-out stub; fetch() already accepts a timeout,
    # so wire the option through with the same 30 s default.
    parser.add_argument('-t', '--timeout', type=float, default=30.0,
                        help='HTTP timeout in seconds (default: 30)')
    args = parser.parse_args()

    page = fetch(args.date, timeout=args.timeout)
    parsed = parse(page)
    if args.output == "json":
        print(to_json(parsed))
    else:
        to_csv(parsed)
|
|
|
|
|
|
|