Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 60 additions & 17 deletions hospexplorer/ask/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
(None, {"fields": ("title", "description", "url")}),
("Metadata", {"fields": (
"date_published",
"document_type", "document_author_institution", "institution_type", "publisher"
"document_type", "document_author_institution", "institution_type", "publisher",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
Expand Down Expand Up @@ -396,6 +396,7 @@ def _apply_zip_csv_metadata(obj, row):
if publisher_raw:
obj.publisher = publisher_raw


for column, model in ZIP_CSV_LOOKUP_COLUMNS.items():
value = (row.get(column) or "").strip()
if not value:
Expand Down Expand Up @@ -503,6 +504,9 @@ def zip_upload_view(self, request):
messages.error(request, "Please select a zip file to upload.")
return HttpResponseRedirect(request.path)

update_file = bool(request.POST.get("update_file"))
update_metadata = bool(request.POST.get("update_metadata"))

try:
archive = zipfile.ZipFile(zip_file)
except zipfile.BadZipFile:
Expand Down Expand Up @@ -554,7 +558,9 @@ def _is_real(name):

total = 0
saved = 0
queued_ids = []
updated = 0
queued_new_ids = []
queued_replace_ids = []
for row in reader:
total += 1
filename = (row.get(filename_col) or "").strip()
Expand All @@ -567,19 +573,45 @@ def _is_real(name):
continue

basename = os.path.basename(filename)
if (basename, title) in existing_pdfs:
messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.")
continue
is_update = (basename, title) in existing_pdfs

member = zip_members.get(filename) or zip_members.get(basename)
if not member:
messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
if is_update and not (update_file or update_metadata):
messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.")
continue

try:
pdf_bytes = archive.read(member)
except KeyError:
messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
# only read PDF bytes when we'll actually use them: new rows
# always need them; existing rows only when update_file is set
pdf_bytes = None
if (not is_update) or update_file:
member = zip_members.get(filename) or zip_members.get(basename)
if not member:
messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.")
continue
try:
pdf_bytes = archive.read(member)
except KeyError:
messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.")
continue

if is_update:
match = PDFResource.objects.filter(
original_filename=basename, title=title
).first()
if match is None:
messages.warning(request, f"Row {total}: '{filename}' lookup failed; skipped.")
continue
if update_metadata:
for warning in _apply_zip_csv_metadata(match, row):
messages.warning(request, f"Row {total}: {warning}")
if update_file:
match.file.delete(save=False)
match.file.save(basename, ContentFile(pdf_bytes), save=False)
match.modifier = request.user
match.status = PDFResource.Status.PROCESSING
match.status_message = "Queued for Knowledge Base re-upload."
match.save()
updated += 1
queued_replace_ids.append(match.pk)
continue

obj = PDFResource(
Expand All @@ -595,23 +627,34 @@ def _is_real(name):
obj.file.save(basename, ContentFile(pdf_bytes), save=True)
saved += 1
existing_pdfs.add((basename, title))
queued_ids.append(obj.pk)
queued_new_ids.append(obj.pk)

# fire KB uploads after the request transaction commits so background
# threads see the just-saved rows
def _start_uploads(ids=tuple(queued_ids)):
for pk in ids:
def _start_uploads(
new_ids=tuple(queued_new_ids),
replace_ids=tuple(queued_replace_ids),
):
for pk in new_ids:
threading.Thread(
target=run_kb_resource_upload,
args=("pdf", pk),
daemon=True,
).start()
for pk in replace_ids:
threading.Thread(
target=run_kb_resource_upload,
args=("pdf", pk),
kwargs={"replace": True},
daemon=True,
).start()
transaction.on_commit(_start_uploads)

messages.success(
request,
f"Imported {saved} of {total} PDFs. Knowledge Base uploads are running in the "
"background — refresh the list to see each row's final status.",
f"Imported {saved} new and updated {updated} of {total} PDF rows. "
"Knowledge Base uploads are running in the background — "
"refresh the list to see each row's final status.",
)
return HttpResponseRedirect(changelist_url)

Expand Down
14 changes: 14 additions & 0 deletions hospexplorer/ask/migrations/0014_merge_20260522_2257.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Generated by Django 6.0.2 on 2026-05-22 22:57

from django.db import migrations


class Migration(migrations.Migration):
    """Merge the two parallel ``0013`` branches of the ``ask`` app.

    This migration carries no operations of its own; it only lists both
    ``0013`` migrations as dependencies so later migrations have a single
    parent to build on.
    """

    dependencies = [
        ("ask", "0013_documentauthorinstitution_documenttype_and_more"),
        ("ask", "0013_pdfresource_original_filename"),
    ]

    # Nothing to apply: the dependency list above is the whole point.
    operations = []
65 changes: 65 additions & 0 deletions hospexplorer/ask/migrations/0015_iso_date_published.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""Switch date_published from (DateField + precision enum) to a single
CharField holding a partial ISO 8601 date string.

A plain AlterField wouldn't carry the precision
information across, so this migration: adds a temporary CharField, copies
each row's old (date, precision) pair into a partial ISO string, removes
the old fields, then renames the temp field to the canonical name.
"""
from django.db import migrations, models


def _to_iso_partial(date, precision):
    """Render an old ``(date, precision)`` pair as a partial ISO 8601 string.

    Args:
        date: The stored date value, or ``None`` when no date was recorded.
        precision: The stored precision marker; ``"year"`` and ``"month"``
            truncate the output, anything else is treated as a full date.

    Returns:
        ``""`` when ``date`` is ``None``; ``"YYYY"`` for year precision;
        ``"YYYY-MM"`` for month precision; otherwise ``date.isoformat()``.
    """
    if date is None:
        return ""
    # "day" -- or any other / empty precision -- falls back to the full date
    if precision not in ("year", "month"):
        return date.isoformat()
    year_part = f"{date.year:04d}"
    if precision == "year":
        return year_part
    return f"{year_part}-{date.month:02d}"


def forwards(apps, schema_editor):
    """Copy each resource's old date/precision pair into ``date_published_iso``.

    Args:
        apps: Historical app registry handed in by ``RunPython``; used to
            look up the ``PDFResource`` and ``WebsiteResource`` models of
            the ``ask`` app as they exist at this point in the migration.
        schema_editor: Supplied by ``RunPython``; not used here.
    """
    # Generator keeps the lookup lazy: each model is resolved only when its
    # turn comes, right before its rows are processed.
    resource_models = (
        apps.get_model("ask", label)
        for label in ("PDFResource", "WebsiteResource")
    )
    for resource_model in resource_models:
        for row in resource_model.objects.all():
            iso_value = _to_iso_partial(
                row.date_published, row.date_published_precision
            )
            row.date_published_iso = iso_value
            # Write only the new column; the old fields are left untouched.
            row.save(update_fields=["date_published_iso"])


class Migration(migrations.Migration):
    """Swap ``date_published`` + ``date_published_precision`` for one CharField.

    The operations run in a fixed order on both ``pdfresource`` and
    ``websiteresource``:

    1. add a temporary ``date_published_iso`` CharField,
    2. fill it from the old fields via ``forwards``,
    3. remove the old ``date_published`` and ``date_published_precision``,
    4. rename ``date_published_iso`` to ``date_published``.

    The reverse of the data-copy step is ``RunPython.noop``.
    """

    # NOTE(review): the merge migration shipped in the same change set is
    # named 0014_merge_20260522_2257, which does not match the name below.
    # Confirm that 0014_merge_20260526_2133 exists in the migration graph
    # (and that the two merges do not leave multiple leaf nodes).
    dependencies = [("ask", "0014_merge_20260526_2133")]

    operations = [
        # 1. temporary text column; 10 chars is enough for "YYYY-MM-DD"
        *(
            migrations.AddField(
                model_name=model_name,
                name="date_published_iso",
                field=models.CharField(blank=True, default="", max_length=10),
            )
            for model_name in ("pdfresource", "websiteresource")
        ),
        # 2. copy the data across while the old fields still exist
        migrations.RunPython(forwards, migrations.RunPython.noop),
        # 3. drop the old fields: date_published on both models first, then
        #    date_published_precision on both models
        *(
            migrations.RemoveField(model_name=model_name, name=field_name)
            for field_name in ("date_published", "date_published_precision")
            for model_name in ("pdfresource", "websiteresource")
        ),
        # 4. give the temporary column the canonical name
        *(
            migrations.RenameField(
                model_name=model_name,
                old_name="date_published_iso",
                new_name="date_published",
            )
            for model_name in ("pdfresource", "websiteresource")
        ),
    ]
22 changes: 20 additions & 2 deletions hospexplorer/ask/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,20 @@ def _build_resource_metadata(obj):
}


def run_kb_resource_upload(model_label, resource_id):
def run_kb_resource_upload(model_label, resource_id, replace=False):
"""Background thread: push a resource to the MCP KB and record its doc_id.

Runs outside the admin's atomic save transaction so a slow or timing-out
MCP call can't roll back the local row. The object's status/status_message
are updated at each phase so the admin can surface progress and errors.

When ``replace`` is True and the row already has an ``mcp_kb_document_id``,
the existing KB doc is deleted before the new one is added — used by the
zip importer when an "update file" / "update metadata" re-upload would
otherwise leave the old chunks in the KB alongside the new ones.
"""
from ask.models import WebsiteResource, PDFResource, Resource
from ask.kb_connector import add_pdf_to_kb, add_website_to_kb
from ask.kb_connector import add_pdf_to_kb, add_website_to_kb, delete_kb_document

if model_label == "pdf":
Model = PDFResource
Expand All @@ -188,6 +193,19 @@ def run_kb_resource_upload(model_label, resource_id):
logger.error("run_kb_resource_upload: %s id=%s not found", model_label, resource_id)
return

if replace and obj.mcp_kb_document_id:
# best-effort: if delete fails the re-add still happens, leaving a
# stale duplicate in the KB rather than losing the new upload
old_doc_id = obj.mcp_kb_document_id
try:
delete_kb_document(old_doc_id)
except Exception:
logger.warning(
"run_kb_resource_upload: failed to delete old KB doc_id=%s for %s id=%s; "
"re-adding anyway", old_doc_id, model_label, resource_id,
)
obj.mcp_kb_document_id = None

try:
metadata = _build_resource_metadata(obj)
if model_label == "pdf":
Expand Down
12 changes: 12 additions & 0 deletions hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@
<label for="zip_file">Zip file:</label>
<input type="file" name="zip_file" id="zip_file" accept=".zip" required>
</p>
<p>
<label>
<input type="checkbox" name="update_file" id="update_file">
Update file: overwrite existing PDFs (same filename and title) with the newly uploaded files.
</label>
</p>
<p>
<label>
<input type="checkbox" name="update_metadata" id="update_metadata">
Update metadata: refresh metadata on existing PDFs (same filename and title).
</label>
</p>
<div class="submit-row">
<input type="submit" value="Upload" class="default">
<a href="{{ changelist_url }}" class="closelink">Cancel</a>
Expand Down
Loading