summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Luka Šijanec <anton@sijanec.eu>2022-11-15 18:35:22 +0100
committerAnton Luka Šijanec <anton@sijanec.eu>2022-11-15 18:35:22 +0100
commit7b2b16af1bc952d6f283a72bebf7becacedbd748 (patch)
tree0f16f363ca1d25994cf288a062b4540ae802494c
parentfixed scoping (diff)
downloadbiblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar.gz
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar.bz2
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar.lz
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar.xz
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.tar.zst
biblos-stat-7b2b16af1bc952d6f283a72bebf7becacedbd748.zip
-rw-r--r--.gitignore1
-rw-r--r--alembic.ini105
-rw-r--r--alembic/README1
-rw-r--r--alembic/env.py79
-rw-r--r--alembic/script.py.mako24
-rw-r--r--alembic/versions/4a3773e332a0_use_utc_datetime_in_db.py.old110
-rwxr-xr-xapp.py (renamed from gather.py)47
7 files changed, 346 insertions, 21 deletions
diff --git a/.gitignore b/.gitignore
index 65eef93..2b01084 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
db
+__pycache__
diff --git a/alembic.ini b/alembic.ini
new file mode 100644
index 0000000..9ae9ea7
--- /dev/null
+++ b/alembic.ini
@@ -0,0 +1,105 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file
+# for all available tokens
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python-dateutil library that can be
+# installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to dateutil.tz.gettz()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version location specification; This defaults
+# to alembic/versions. When using multiple version
+# directories, initial revisions must be specified with --version-path.
+# The path separator used here should be the separator specified by "version_path_separator" below.
+# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep.
+# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os # Use os.pathsep. Default configuration used for new projects.
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = sqlite:///db
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts. See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/alembic/README b/alembic/README
new file mode 100644
index 0000000..98e4f9c
--- /dev/null
+++ b/alembic/README
@@ -0,0 +1 @@
+Generic single-database configuration.
\ No newline at end of file
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..2e65ca4
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,79 @@
+from logging.config import fileConfig
+
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+
+from alembic import context
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# fileConfig() is only called when alembic was started with an ini file
+# (config_file_name is not None).
+if config.config_file_name is not None:
+ fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+# from myapp import mymodel
+# target_metadata = mymodel.Base.metadata
+# Here the metadata comes from the declarative Base exported by app.py.
+# NOTE(review): importing app executes its module-level statements as a
+# side effect -- confirm they are safe to run under the alembic CLI.
+from app import Base
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def run_migrations_offline() -> None:
+    """Run the migrations in 'offline' mode.
+
+    The migration context is configured from the ``sqlalchemy.url`` option
+    alone, so no Engine is created and no DBAPI has to be available
+    (an Engine would be acceptable here as well).
+
+    Calls to context.execute() made in this mode emit the given string to
+    the script output.
+    """
+    offline_options = {
+        "url": config.get_main_option("sqlalchemy.url"),
+        "target_metadata": target_metadata,
+        "literal_binds": True,
+        "dialect_opts": {"paramstyle": "named"},
+    }
+    context.configure(**offline_options)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run the migrations in 'online' mode.
+
+    An Engine is built from the ``sqlalchemy.``-prefixed options of the
+    active ini section, and a connection obtained from it is associated
+    with the migration context before the migrations are run.
+    """
+    ini_section = config.get_section(config.config_ini_section)
+    engine = engine_from_config(
+        ini_section,
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+
+    with engine.connect() as live_connection:
+        context.configure(
+            connection=live_connection,
+            target_metadata=target_metadata,
+        )
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+# Pick the runner that matches the mode alembic was invoked in.
+if not context.is_offline_mode():
+    run_migrations_online()
+else:
+    run_migrations_offline()
diff --git a/alembic/script.py.mako b/alembic/script.py.mako
new file mode 100644
index 0000000..55df286
--- /dev/null
+++ b/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+ ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+ ${downgrades if downgrades else "pass"}
diff --git a/alembic/versions/4a3773e332a0_use_utc_datetime_in_db.py.old b/alembic/versions/4a3773e332a0_use_utc_datetime_in_db.py.old
new file mode 100644
index 0000000..e011a3a
--- /dev/null
+++ b/alembic/versions/4a3773e332a0_use_utc_datetime_in_db.py.old
@@ -0,0 +1,110 @@
+"""use UTC DateTime in DB
+
+Revision ID: 4a3773e332a0
+Revises:
+Create Date: 2022-11-15 17:35:11.717714
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '4a3773e332a0'
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Replace the ISO 8601 text columns of ``borrows`` with UTC DateTime columns.
+
+    Adds ``purchase_utc``/``expiration_utc`` (naive datetimes expressed in
+    UTC) and ``purchase_timezone``/``expiration_timezone`` (signed offset
+    from UTC in seconds), fills them from the existing ``purchase`` and
+    ``expiration`` strings, then drops the two string columns.
+
+    Rows whose ``purchase`` is NULL are reported on stdout and left with
+    NULL in all four new columns.  A NULL ``expiration`` on an otherwise
+    valid row yields NULL ``expiration_utc``/``expiration_timezone``.
+    """
+    from datetime import datetime, timezone
+
+    iso8601 = "%Y-%m-%dT%H:%M:%S%z"
+
+    def split_timestamp(text):
+        """Return (naive UTC datetime, UTC offset in seconds) for *text*."""
+        if text is None:
+            return None, None
+        aware = datetime.strptime(text, iso8601)
+        # total_seconds() keeps the sign of the offset; timedelta.seconds is
+        # never negative and would store -05:00 as 68400 instead of -18000.
+        offset_seconds = int(aware.utcoffset().total_seconds())
+        return aware.astimezone(timezone.utc).replace(tzinfo=None), offset_seconds
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('borrows', sa.Column('purchase_utc', sa.DateTime(), nullable=True))
+    op.add_column('borrows', sa.Column('expiration_utc', sa.DateTime(), nullable=True))
+    op.add_column('borrows', sa.Column('purchase_timezone', sa.Integer(), nullable=True))
+    op.add_column('borrows', sa.Column('expiration_timezone', sa.Integer(), nullable=True))
+    # Lightweight table description used only for the data migration below.
+    borrows = sa.Table(
+        "borrows",
+        sa.MetaData(),
+        sa.Column("id", sa.Integer, primary_key=True, nullable=False),
+        sa.Column("purchase", sa.String, nullable=True),
+        sa.Column("expiration", sa.String, nullable=True),
+        sa.Column("purchase_utc", sa.DateTime, nullable=True),
+        sa.Column("expiration_utc", sa.DateTime, nullable=True),
+        sa.Column("purchase_timezone", sa.Integer, nullable=True),
+        sa.Column("expiration_timezone", sa.Integer, nullable=True),
+    )
+    connection = op.get_bind()
+    # Only the source columns are read; the new columns were added just
+    # above and are NULL in every row.  Positional select() is the form
+    # accepted by both SQLAlchemy 1.4 and 2.0.
+    rows = connection.execute(sa.select(
+        borrows.c.id,
+        borrows.c.purchase,
+        borrows.c.expiration,
+    )).fetchall()
+    for row_id, purchase, expiration in rows:
+        if row_id % 1000 == 0:
+            # Progress indicator, overwritten in place thanks to "\r".
+            print(f"... obdelujem id {row_id}", end="\r")
+        if purchase is None:
+            print(f"at id {row_id} purchase is None")
+            continue
+        purchase_utc, purchase_timezone = split_timestamp(purchase)
+        expiration_utc, expiration_timezone = split_timestamp(expiration)
+        connection.execute(borrows.update().where(borrows.c.id == row_id).values(
+            purchase_utc=purchase_utc,
+            expiration_utc=expiration_utc,
+            purchase_timezone=purchase_timezone,
+            expiration_timezone=expiration_timezone,
+        ))
+    op.drop_column('borrows', 'expiration')
+    op.drop_column('borrows', 'purchase')
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Restore the ISO 8601 text columns ``purchase``/``expiration`` of ``borrows``.
+
+    Re-adds the two string columns, rebuilds each value from the stored
+    naive-UTC datetime plus its UTC offset in seconds, then drops the four
+    columns that upgrade() introduced.
+
+    Rows whose ``purchase_utc`` is NULL are reported on stdout and left
+    with NULL strings.  A NULL ``expiration_utc`` yields a NULL
+    ``expiration``; a NULL offset is rendered as UTC (+00:00).
+    """
+    from datetime import timedelta, timezone
+
+    def to_iso8601(naive_utc, offset_seconds):
+        """Render a naive UTC datetime as ISO 8601 text in its original offset."""
+        if naive_utc is None:
+            return None
+        local_tz = timezone(timedelta(seconds=offset_seconds or 0))
+        # The stored value is naive but expressed in UTC.  It must be tagged
+        # as UTC first: astimezone() on a naive datetime would interpret it
+        # as system-local time and shift the result on non-UTC hosts.
+        return naive_utc.replace(tzinfo=timezone.utc).astimezone(local_tz).isoformat()
+
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('borrows', sa.Column('purchase', sa.VARCHAR(), nullable=True))
+    op.add_column('borrows', sa.Column('expiration', sa.VARCHAR(), nullable=True))
+    # Lightweight table description used only for the data migration below.
+    borrows = sa.Table(
+        "borrows",
+        sa.MetaData(),
+        sa.Column("id", sa.Integer, primary_key=True, nullable=False),
+        sa.Column("purchase", sa.String, nullable=True),
+        sa.Column("expiration", sa.String, nullable=True),
+        sa.Column("purchase_utc", sa.DateTime, nullable=True),
+        sa.Column("expiration_utc", sa.DateTime, nullable=True),
+        sa.Column("purchase_timezone", sa.Integer, nullable=True),
+        sa.Column("expiration_timezone", sa.Integer, nullable=True),
+    )
+    connection = op.get_bind()
+    # Positional select() is the form accepted by both SQLAlchemy 1.4 and 2.0.
+    rows = connection.execute(sa.select(
+        borrows.c.id,
+        borrows.c.purchase_utc,
+        borrows.c.expiration_utc,
+        borrows.c.purchase_timezone,
+        borrows.c.expiration_timezone,
+    )).fetchall()
+    for row_id, purchase_utc, expiration_utc, purchase_timezone, expiration_timezone in rows:
+        if row_id % 1000 == 0:
+            # Progress indicator, overwritten in place thanks to "\r".
+            print(f"... obdelujem id {row_id}", end="\r")
+        if purchase_utc is None:
+            print(f"at id {row_id} purchase_utc is None")
+            continue
+        connection.execute(borrows.update().where(borrows.c.id == row_id).values(
+            purchase=to_iso8601(purchase_utc, purchase_timezone),
+            expiration=to_iso8601(expiration_utc, expiration_timezone),
+        ))
+    op.drop_column('borrows', 'expiration_timezone')
+    op.drop_column('borrows', 'purchase_timezone')
+    op.drop_column('borrows', 'expiration_utc')
+    op.drop_column('borrows', 'purchase_utc')
+    # ### end Alembic commands ###
diff --git a/gather.py b/app.py
index c213d70..198eff9 100755
--- a/gather.py
+++ b/app.py
@@ -6,7 +6,7 @@ import requests
from base64 import b64decode
from datetime import datetime, timedelta, timezone
try:
- from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, Table, ForeignKey, create_engine, select
+ from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, Table, ForeignKey, create_engine, select, DateTime
from sqlalchemy.orm import declarative_base, relationship, Session
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
@@ -15,11 +15,7 @@ try:
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")
-if len(argv) != 1+2:
- raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
-
operator_contact = argv[2]
-engine = create_engine(argv[1], echo=True, future=True)
Base = declarative_base()
@@ -42,25 +38,24 @@ class Borrow(Base):
id = Column(Integer, primary_key=True, nullable=False, doc="id in transaction element of acsm or in filename of acsm on http")
isbn = Column(ForeignKey("books.isbn"), nullable=False, doc="foreign key that leads to a book")
transaction = Column(String, nullable=True, doc="transaction element content, but only if it couldn't be derived from format ACS-BIBL-L-{acsm_id}, otherwise Null")
- purchase = Column(String, nullable=True, doc="acsm purchase element: iso8601 of purchase of book, including timezone")
- expiration = Column(String, nullable=True, doc="acsm expiration element: iso8601 of expiration of acsm, including timezone")
+ purchase_utc = Column(DateTime, nullable=True, doc="acsm purchase element excluding timezone in UTC")
+ expiration_utc = Column(DateTime, nullable=True, doc="acsm expiration element excluding timezone in UTC")
+ purchase_timezone = Column(Integer, nullable=True, doc="acsm purchase element timezone offset from UTC in seconds (note that purchase is UTC)")
+ expiration_timezone = Column(Integer, nullable=True, doc="acsm expiration element timezone offset from UTC in seconds (note that expiration is UTC)")
obtained = Column(BigInteger, nullable=False, doc="UNIX timestamp when this borrow was obtained as acsm from http")
+ duration = Column(Integer, nullable=True, doc="duration in seconds that a DRM client may make the book available")
book = relationship("Book", back_populates="borrows")
def __repr__(self):
- return f"Borrow(id={self.id!r}, isbn={self.isbn!r}, purchase={self.purchase!r}, expiration={self.expiration!r}, obtained=mktime({localtime(self.obtained)!r}), book={self.book!r})"
+ return f"Borrow(id={self.id!r}, isbn={self.isbn!r}, purchase={self.purchase_utc!r}, purchase_timezone={self.purchase_timezone!r} expiration={self.expiration_utc!r}, expiration_timezone={self.expiration_timezone!r}, obtained=mktime({localtime(self.obtained)!r}), duration={self.duration!r}, book={self.book!r})"
logging.basicConfig(level=logging.NOTSET)
logger = logging.getLogger(argv[0])
logger.debug("welcome to %s", argv[0])
-Base.metadata.create_all(engine)
-
starting_acsm_id = 177238
guaranteed_large_acsm_id = 1170487
-logger.debug(f"created metadata.")
-
-def update(hmfan2iarts=100):
+def update(engine, hmfan2iarts=100):
force_acsm_id = 0
valid_acsms = 0
only_isbn_acsms = 0
@@ -85,7 +80,7 @@ def update(hmfan2iarts=100):
if (r.status_code == 200):
failed_acsms_not200_in_a_row = 0
if r.status_code != 200:
- if datetime.strptime(borrow.purchase, "%Y-%m-%dT%H:%M:%S%z") > datetime.now(timezone.utc) - timedelta(hours=1):
+ if borrow.purchase_utc > datetime.now(timezone.utc) - timedelta(hours=1):
logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id} and the last requested acsm was created less than an hour ago")
break
logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {hmfan2iarts-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
@@ -158,8 +153,12 @@ def update(hmfan2iarts=100):
publisher = ft.resourceItemInfo.metadata.publisher.string
language = ft.resourceItemInfo.metadata.language.string
format = ft.resourceItemInfo.metadata.format.string
- purchase = ft.purchase.string
- expiration = ft.expiration.string
+ purchase_utc = datetime.strptime(ft.purchase.string, "%Y-%m-%dT%H:%M:%S%z")
+ expiration_utc = datetime.strptime(ft.expiration.string, "%Y-%m-%dT%H:%M:%S%z")
+ purchase_timezone = purchase_utc.tzinfo.utcoffset(None).seconds
+ expiration_timezone = expiration_utc.tzinfo.utcoffset(None).seconds
+ purchase_utc = purchase_utc.astimezone(timezone.utc).replace(tzinfo=None)
+ expiration_utc = expiration_utc.astimezone(timezone.utc).replace(tzinfo=None)
if identifier_is_isbn:
identifier = None
book = session.get(Book, isbn)
@@ -174,15 +173,21 @@ def update(hmfan2iarts=100):
book.thumbnail_extension = thumbnail_extension
book.language = language
book.format = format
- borrow = Borrow(id=acsm_id, isbn=isbn, purchase=purchase, expiration=expiration, obtained=int(time()), book=book, transaction=transaction)
+ borrow = Borrow(id=acsm_id, isbn=isbn, purchase_utc=purchase_utc, expiration_utc=expiration_utc, obtained=int(time()), book=book, transaction=transaction, purchase_timezone=purchase_timezone, expiration_timezone=expiration_timezone, duration=duration)
logger.info(f"found a new {borrow!r}")
session.add(borrow)
session.commit()
valid_acsms += 1
logger.info(f"In this update, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
return {"valid_acsms": valid_acsms, "only_isbn_acsms": only_isbn_acsms, "failed_acsms": failed_acsms, "failed_acsms_not200": failed_acsms_not200, "acsm_id": acsm_id}
-try:
- r = update()
-except KeyboardInterrupt:
- logger.warning(f"Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")
+if __name__ == "__main__":
+    # Script entry point: the engine is created and the crawl started only
+    # when this file is executed directly.
+    if len(argv) != 3:
+        raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
+    engine = create_engine(argv[1], echo=True, future=True)
+    # Create any tables that are still missing before the crawl begins.
+    Base.metadata.create_all(engine)
+    logger.debug(f"created metadata.")
+    try:
+        r = update(engine)
+    except KeyboardInterrupt:
+        logger.warning(f"Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")