summary refs log tree commit diff stats
path: root/gather.py
diff options
context:
space:
mode:
author Anton Luka Šijanec <anton@sijanec.eu> 2022-11-07 12:43:33 +0100
committer Anton Luka Šijanec <anton@sijanec.eu> 2022-11-07 12:43:33 +0100
commit d87288573e19a7aca802d172e80bbafbf692dc71 (patch)
tree 0041df2568e0bc63012fb742f0291a2afe6f1f19 /gather.py
parent increased how many failed acsms not 200 in a row to stop to 100 (diff)
download biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.gz
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.bz2
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.lz
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.xz
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.tar.zst
biblos-stat-d87288573e19a7aca802d172e80bbafbf692dc71.zip
Diffstat (limited to '')
-rwxr-xr-x gather.py 20
1 files changed, 14 insertions, 6 deletions
diff --git a/gather.py b/gather.py
index 8ce676e..d4beec6 100755
--- a/gather.py
+++ b/gather.py
@@ -10,7 +10,7 @@ try:
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
try:
- from bs4 import BeautifulSoup
+ from bs4 import BeautifulSoup, FeatureNotFound
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")
@@ -29,7 +29,7 @@ class Book(Base):
creator = Column(String, nullable=True, doc="author of the book, dc:creator in acsm")
publisher = Column(String, nullable=True, doc="publisher of the book, dc:publisher in acsm")
identifier = Column(String, nullable=True, doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.")
- thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png")
+ thumbnail_extension = Column(String, nullable=True, doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element")
format = Column(String, nullable=True, doc="format of the file. I've seen application/pdf and application/epub+zip")
language = Column(String, nullable=True, doc="language of the book. I've seen sl.")
borrows = relationship("Borrow", back_populates="book");
@@ -106,7 +106,10 @@ try:
force_acsm_id = acsm_id+1
failed_acsms += 1
else:
- acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
+ try:
+ acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
+ except FeatureNotFound:
+ raise FeatureNotFound("pip3 install lxml")
ft = acsm.fulfillmentToken
expected = f"ACS-BIBL-L-{acsm_id}"
if ft.transaction.string != expected:
@@ -127,9 +130,14 @@ try:
raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {ft.resourceItemInfo.licenseToken.resource.string} in acsm {acsm_id}")
uuid = expected.split(":").pop()
expected = f"https://cs.alliance.inkbook.eu/books/{uuid}."
- if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True:
- raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
- thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
+ try:
+ if ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected) != True:
+ raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
+ thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
+ except AttributeError:
+ thumbnail_extension = None
+ if ft.resourceItemInfo.metadata.thumbnailURL != None:
+ raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")
duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string)
if duration != int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string):
raise ValueError(f"expected {duration} in fr.int(resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {int(resourceItemInfo.licenseToken.permissions.play.duration.string)} in acsm {acsm_id}")