Added timestamp and removed all ';' from data.

This commit is contained in:
niamtokik
2024-08-12 07:38:46 +00:00
parent 30aba5bb06
commit 2d2d8175f5

View File

@@ -15,6 +15,7 @@ import argparse
import html
import httpx
from bs4 import BeautifulSoup
import re
import csv
import json
@@ -28,7 +29,7 @@ def fetch(date, target="https://diskprices.com", timeout=30.0):
response = httpx.get(url, follow_redirects=True, timeout=timeout)
return response.text
def parse(page):
def parse(page, date=None, timestamp=False):
"""
parse diskprices html page and convert table to a dict.
"""
@@ -41,6 +42,8 @@ def parse(page):
headers = line.find_all('th')
if headers:
data = list(map(lambda x: string_converter(x.text), headers))
if timestamp:
data.insert(0, "Date")
result.append(data)
next
@@ -48,8 +51,11 @@ def parse(page):
items = line.find_all('td')
if items:
data = list(map(lambda x: string_converter(x.text), items))
if timestamp:
data.insert(0, date)
result.append(data)
next
return result
def string_converter(text):
@@ -59,7 +65,8 @@ def string_converter(text):
entities = html.unescape(text)
cleanup = entities.encode("ascii", "ignore")
utf8 = str(cleanup, 'utf-8')
return utf8
cleanup_sep = re.sub(";", "", utf8)
return cleanup_sep
def to_csv(parsed):
"""
@@ -91,10 +98,13 @@ if __name__ == '__main__':
)
parser.add_argument('-d', '--date', required=True)
parser.add_argument('-o', '--output', default="csv")
parser.add_argument('-t', '--timestamp', default=False, action=argparse.BooleanOptionalAction)
# parser.add_argument('-t', '--timeout')
args = parser.parse_args()
page = fetch(args.date)
parsed = parse(page)
date = args.date
timestamp = args.timestamp
page = fetch(date)
parsed = parse(page, date=date, timestamp=timestamp)
if args.output == "json":
print(to_json(parsed))
else: