Added timestamp and removed all ';' from data.
This commit is contained in:
@@ -15,6 +15,7 @@ import argparse
|
||||
import html
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
|
||||
@@ -28,7 +29,7 @@ def fetch(date, target="https://diskprices.com", timeout=30.0):
|
||||
response = httpx.get(url, follow_redirects=True, timeout=timeout)
|
||||
return response.text
|
||||
|
||||
def parse(page):
|
||||
def parse(page, date=None, timestamp=False):
|
||||
"""
|
||||
parse diskprices html page and convert table to a dict.
|
||||
"""
|
||||
@@ -41,6 +42,8 @@ def parse(page):
|
||||
headers = line.find_all('th')
|
||||
if headers:
|
||||
data = list(map(lambda x: string_converter(x.text), headers))
|
||||
if timestamp:
|
||||
data.insert(0, "Date")
|
||||
result.append(data)
|
||||
next
|
||||
|
||||
@@ -48,8 +51,11 @@ def parse(page):
|
||||
items = line.find_all('td')
|
||||
if items:
|
||||
data = list(map(lambda x: string_converter(x.text), items))
|
||||
if timestamp:
|
||||
data.insert(0, date)
|
||||
result.append(data)
|
||||
next
|
||||
|
||||
return result
|
||||
|
||||
def string_converter(text):
|
||||
@@ -59,7 +65,8 @@ def string_converter(text):
|
||||
entities = html.unescape(text)
|
||||
cleanup = entities.encode("ascii", "ignore")
|
||||
utf8 = str(cleanup, 'utf-8')
|
||||
return utf8
|
||||
cleanup_sep = re.sub(";", "", utf8)
|
||||
return cleanup_sep
|
||||
|
||||
def to_csv(parsed):
|
||||
"""
|
||||
@@ -91,10 +98,13 @@ if __name__ == '__main__':
|
||||
)
|
||||
parser.add_argument('-d', '--date', required=True)
|
||||
parser.add_argument('-o', '--output', default="csv")
|
||||
parser.add_argument('-t', '--timestamp', default=False, action=argparse.BooleanOptionalAction)
|
||||
# parser.add_argument('-t', '--timeout')
|
||||
args = parser.parse_args()
|
||||
page = fetch(args.date)
|
||||
parsed = parse(page)
|
||||
date = args.date
|
||||
timestamp = args.timestamp
|
||||
page = fetch(date)
|
||||
parsed = parse(page, date=date, timestamp=timestamp)
|
||||
if args.output == "json":
|
||||
print(to_json(parsed))
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user