diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 71a3023..4daeabd 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -296,7 +296,7 @@ class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): (None, {"fields": ("title", "description", "url")}), ("Metadata", {"fields": ( "date_published", - "document_type", "document_author_institution", "institution_type", "publisher" + "document_type", "document_author_institution", "institution_type", "publisher", )}), ("Status", {"fields": ( "status", "status_message", "mcp_kb_document_id", @@ -396,6 +396,7 @@ def _apply_zip_csv_metadata(obj, row): if publisher_raw: obj.publisher = publisher_raw + for column, model in ZIP_CSV_LOOKUP_COLUMNS.items(): value = (row.get(column) or "").strip() if not value: @@ -503,6 +504,9 @@ def zip_upload_view(self, request): messages.error(request, "Please select a zip file to upload.") return HttpResponseRedirect(request.path) + update_file = bool(request.POST.get("update_file")) + update_metadata = bool(request.POST.get("update_metadata")) + try: archive = zipfile.ZipFile(zip_file) except zipfile.BadZipFile: @@ -554,7 +558,9 @@ def _is_real(name): total = 0 saved = 0 - queued_ids = [] + updated = 0 + queued_new_ids = [] + queued_replace_ids = [] for row in reader: total += 1 filename = (row.get(filename_col) or "").strip() @@ -567,19 +573,45 @@ def _is_real(name): continue basename = os.path.basename(filename) - if (basename, title) in existing_pdfs: - messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") - continue + is_update = (basename, title) in existing_pdfs - member = zip_members.get(filename) or zip_members.get(basename) - if not member: - messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + if is_update and not (update_file or update_metadata): + messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") continue - try: - pdf_bytes = archive.read(member) - except KeyError: - messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.") + # only read PDF bytes when we'll actually use them: new rows + # always need them; existing rows only when update_file is set + pdf_bytes = None + if (not is_update) or update_file: + member = zip_members.get(filename) or zip_members.get(basename) + if not member: + messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + continue + try: + pdf_bytes = archive.read(member) + except KeyError: + messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.") + continue + + if is_update: + match = PDFResource.objects.filter( + original_filename=basename, title=title + ).first() + if match is None: + messages.warning(request, f"Row {total}: '{filename}' lookup failed; skipped.") + continue + if update_metadata: + for warning in _apply_zip_csv_metadata(match, row): + messages.warning(request, f"Row {total}: {warning}") + if update_file: + match.file.delete(save=False) + match.file.save(basename, ContentFile(pdf_bytes), save=False) + match.modifier = request.user + match.status = PDFResource.Status.PROCESSING + match.status_message = "Queued for Knowledge Base re-upload." + match.save() + updated += 1 + queued_replace_ids.append(match.pk) continue obj = PDFResource( @@ -595,23 +627,34 @@ def _is_real(name): obj.file.save(basename, ContentFile(pdf_bytes), save=True) saved += 1 existing_pdfs.add((basename, title)) - queued_ids.append(obj.pk) + queued_new_ids.append(obj.pk) # fire KB uploads after the request transaction commits so background # threads see the just-saved rows - def _start_uploads(ids=tuple(queued_ids)): - for pk in ids: + def _start_uploads( + new_ids=tuple(queued_new_ids), + replace_ids=tuple(queued_replace_ids), + ): + for pk in new_ids: + threading.Thread( + target=run_kb_resource_upload, + args=("pdf", pk), + daemon=True, + ).start() + for pk in replace_ids: threading.Thread( target=run_kb_resource_upload, args=("pdf", pk), + kwargs={"replace": True}, daemon=True, ).start() transaction.on_commit(_start_uploads) messages.success( request, - f"Imported {saved} of {total} PDFs. Knowledge Base uploads are running in the " - "background — refresh the list to see each row's final status.", + f"Imported {saved} new and updated {updated} of {total} PDF rows. " + "Knowledge Base uploads are running in the background — " + "refresh the list to see each row's final status.", ) return HttpResponseRedirect(changelist_url) diff --git a/hospexplorer/ask/migrations/0014_merge_20260522_2257.py b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py new file mode 100644 index 0000000..eaaa6c1 --- /dev/null +++ b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py @@ -0,0 +1,14 @@ +# Generated by Django 6.0.2 on 2026-05-22 22:57 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('ask', '0013_documentauthorinstitution_documenttype_and_more'), + ('ask', '0013_pdfresource_original_filename'), + ] + + operations = [ + ] diff --git a/hospexplorer/ask/migrations/0015_iso_date_published.py b/hospexplorer/ask/migrations/0015_iso_date_published.py new file mode 100644 index 0000000..c18bd65 --- /dev/null +++ b/hospexplorer/ask/migrations/0015_iso_date_published.py @@ -0,0 +1,65 @@ +"""Switch date_published from (DateField + precision enum) to a single +CharField holding a partial ISO 8601 date string. + +The MissingMigration plus an AlterField wouldn't carry the precision +information across, so this migration: adds a temporary CharField, copies +each row's old (date, precision) pair into a partial ISO string, removes +the old fields, then renames the temp field to the canonical name. +""" +from django.db import migrations, models + + +def _to_iso_partial(date, precision): + if date is None: + return "" + if precision == "year": + return f"{date.year:04d}" + if precision == "month": + return f"{date.year:04d}-{date.month:02d}" + # "day" — or any other / empty precision, which we treat as a full date + return date.isoformat() + + +def forwards(apps, schema_editor): + for model_name in ("PDFResource", "WebsiteResource"): + Model = apps.get_model("ask", model_name) + for obj in Model.objects.all(): + obj.date_published_iso = _to_iso_partial( + obj.date_published, obj.date_published_precision + ) + obj.save(update_fields=["date_published_iso"]) + + +class Migration(migrations.Migration): + + dependencies = [ + ("ask", "0014_merge_20260526_2133"), + ] + + operations = [ + migrations.AddField( + model_name="pdfresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.AddField( + model_name="websiteresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.RunPython(forwards, migrations.RunPython.noop), + migrations.RemoveField(model_name="pdfresource", name="date_published"), + migrations.RemoveField(model_name="websiteresource", name="date_published"), + migrations.RemoveField(model_name="pdfresource", name="date_published_precision"), + migrations.RemoveField(model_name="websiteresource", name="date_published_precision"), + migrations.RenameField( + model_name="pdfresource", + old_name="date_published_iso", + new_name="date_published", + ), + migrations.RenameField( + model_name="websiteresource", + old_name="date_published_iso", + new_name="date_published", + ), + ] diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index 42edfcd..a6eff2c 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -164,15 +164,20 @@ def _build_resource_metadata(obj): } -def run_kb_resource_upload(model_label, resource_id): +def run_kb_resource_upload(model_label, resource_id, replace=False): """Background thread: push a resource to the MCP KB and record its doc_id. Runs outside the admin's atomic save transaction so a slow or timing-out MCP call can't roll back the local row. The object's status/status_message are updated at each phase so the admin can surface progress and errors. + + When ``replace`` is True and the row already has an ``mcp_kb_document_id``, + the existing KB doc is deleted before the new one is added — used by the + zip importer when an "update file" / "update metadata" re-upload would + otherwise leave the old chunks in the KB alongside the new ones. """ from ask.models import WebsiteResource, PDFResource, Resource - from ask.kb_connector import add_pdf_to_kb, add_website_to_kb + from ask.kb_connector import add_pdf_to_kb, add_website_to_kb, delete_kb_document if model_label == "pdf": Model = PDFResource @@ -188,6 +193,19 @@ def run_kb_resource_upload(model_label, resource_id): logger.error("run_kb_resource_upload: %s id=%s not found", model_label, resource_id) return + if replace and obj.mcp_kb_document_id: + # best-effort: if delete fails the re-add still happens, leaving a + # stale duplicate in the KB rather than losing the new upload + old_doc_id = obj.mcp_kb_document_id + try: + delete_kb_document(old_doc_id) + except Exception: + logger.warning( + "run_kb_resource_upload: failed to delete old KB doc_id=%s for %s id=%s; " + "re-adding anyway", old_doc_id, model_label, resource_id, + ) + obj.mcp_kb_document_id = None + try: metadata = _build_resource_metadata(obj) if model_label == "pdf": diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 0bfe104..e6bd9d5 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -30,6 +30,18 @@
++ +
++ +