diff --git a/hospexplorer/ask/admin.py b/hospexplorer/ask/admin.py index 71a3023..4daeabd 100644 --- a/hospexplorer/ask/admin.py +++ b/hospexplorer/ask/admin.py @@ -296,7 +296,7 @@ class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin): (None, {"fields": ("title", "description", "url")}), ("Metadata", {"fields": ( "date_published", - "document_type", "document_author_institution", "institution_type", "publisher" + "document_type", "document_author_institution", "institution_type", "publisher", )}), ("Status", {"fields": ( "status", "status_message", "mcp_kb_document_id", @@ -396,6 +396,7 @@ def _apply_zip_csv_metadata(obj, row): if publisher_raw: obj.publisher = publisher_raw + for column, model in ZIP_CSV_LOOKUP_COLUMNS.items(): value = (row.get(column) or "").strip() if not value: @@ -503,6 +504,9 @@ def zip_upload_view(self, request): messages.error(request, "Please select a zip file to upload.") return HttpResponseRedirect(request.path) + update_file = bool(request.POST.get("update_file")) + update_metadata = bool(request.POST.get("update_metadata")) + try: archive = zipfile.ZipFile(zip_file) except zipfile.BadZipFile: @@ -554,7 +558,9 @@ def _is_real(name): total = 0 saved = 0 - queued_ids = [] + updated = 0 + queued_new_ids = [] + queued_replace_ids = [] for row in reader: total += 1 filename = (row.get(filename_col) or "").strip() @@ -567,19 +573,45 @@ def _is_real(name): continue basename = os.path.basename(filename) - if (basename, title) in existing_pdfs: - messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") - continue + is_update = (basename, title) in existing_pdfs - member = zip_members.get(filename) or zip_members.get(basename) - if not member: - messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + if is_update and not (update_file or update_metadata): + messages.warning(request, f"Row {total}: '{filename}' already exists; skipped.") continue - try: - pdf_bytes = archive.read(member) - except KeyError: - messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.") + # only read PDF bytes when we'll actually use them: new rows + # always need them; existing rows only when update_file is set + pdf_bytes = None + if (not is_update) or update_file: + member = zip_members.get(filename) or zip_members.get(basename) + if not member: + messages.warning(request, f"Row {total}: '{filename}' not in zip; skipped.") + continue + try: + pdf_bytes = archive.read(member) + except KeyError: + messages.warning(request, f"Row {total}: could not read '{filename}'; skipped.") + continue + + if is_update: + match = PDFResource.objects.filter( + original_filename=basename, title=title + ).first() + if match is None: + messages.warning(request, f"Row {total}: '{filename}' lookup failed; skipped.") + continue + if update_metadata: + for warning in _apply_zip_csv_metadata(match, row): + messages.warning(request, f"Row {total}: {warning}") + if update_file: + match.file.delete(save=False) + match.file.save(basename, ContentFile(pdf_bytes), save=False) + match.modifier = request.user + match.status = PDFResource.Status.PROCESSING + match.status_message = "Queued for Knowledge Base re-upload." + match.save() + updated += 1 + queued_replace_ids.append(match.pk) continue obj = PDFResource( @@ -595,23 +627,34 @@ def _is_real(name): obj.file.save(basename, ContentFile(pdf_bytes), save=True) saved += 1 existing_pdfs.add((basename, title)) - queued_ids.append(obj.pk) + queued_new_ids.append(obj.pk) # fire KB uploads after the request transaction commits so background # threads see the just-saved rows - def _start_uploads(ids=tuple(queued_ids)): - for pk in ids: + def _start_uploads( + new_ids=tuple(queued_new_ids), + replace_ids=tuple(queued_replace_ids), + ): + for pk in new_ids: + threading.Thread( + target=run_kb_resource_upload, + args=("pdf", pk), + daemon=True, + ).start() + for pk in replace_ids: threading.Thread( target=run_kb_resource_upload, args=("pdf", pk), + kwargs={"replace": True}, daemon=True, ).start() transaction.on_commit(_start_uploads) messages.success( request, - f"Imported {saved} of {total} PDFs. Knowledge Base uploads are running in the " - "background — refresh the list to see each row's final status.", + f"Imported {saved} new and updated {updated} of {total} PDF rows. " + "Knowledge Base uploads are running in the background — " + "refresh the list to see each row's final status.", ) return HttpResponseRedirect(changelist_url) diff --git a/hospexplorer/ask/migrations/0014_merge_20260522_2257.py b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py new file mode 100644 index 0000000..eaaa6c1 --- /dev/null +++ b/hospexplorer/ask/migrations/0014_merge_20260522_2257.py @@ -0,0 +1,14 @@ +# Generated by Django 6.0.2 on 2026-05-22 22:57 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('ask', '0013_documentauthorinstitution_documenttype_and_more'), + ('ask', '0013_pdfresource_original_filename'), + ] + + operations = [ + ] diff --git a/hospexplorer/ask/migrations/0015_iso_date_published.py b/hospexplorer/ask/migrations/0015_iso_date_published.py new file mode 100644 index 0000000..c18bd65 --- /dev/null +++ b/hospexplorer/ask/migrations/0015_iso_date_published.py @@ -0,0 +1,65 @@ +"""Switch date_published from (DateField + precision enum) to a single +CharField holding a partial ISO 8601 date string. + +The MissingMigration plus an AlterField wouldn't carry the precision +information across, so this migration: adds a temporary CharField, copies +each row's old (date, precision) pair into a partial ISO string, removes +the old fields, then renames the temp field to the canonical name. +""" +from django.db import migrations, models + + +def _to_iso_partial(date, precision): + if date is None: + return "" + if precision == "year": + return f"{date.year:04d}" + if precision == "month": + return f"{date.year:04d}-{date.month:02d}" + # "day" — or any other / empty precision, which we treat as a full date + return date.isoformat() + + +def forwards(apps, schema_editor): + for model_name in ("PDFResource", "WebsiteResource"): + Model = apps.get_model("ask", model_name) + for obj in Model.objects.all(): + obj.date_published_iso = _to_iso_partial( + obj.date_published, obj.date_published_precision + ) + obj.save(update_fields=["date_published_iso"]) + + +class Migration(migrations.Migration): + + dependencies = [ + ("ask", "0014_merge_20260526_2133"), + ] + + operations = [ + migrations.AddField( + model_name="pdfresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.AddField( + model_name="websiteresource", + name="date_published_iso", + field=models.CharField(blank=True, default="", max_length=10), + ), + migrations.RunPython(forwards, migrations.RunPython.noop), + migrations.RemoveField(model_name="pdfresource", name="date_published"), + migrations.RemoveField(model_name="websiteresource", name="date_published"), + migrations.RemoveField(model_name="pdfresource", name="date_published_precision"), + migrations.RemoveField(model_name="websiteresource", name="date_published_precision"), + migrations.RenameField( + model_name="pdfresource", + old_name="date_published_iso", + new_name="date_published", + ), + migrations.RenameField( + model_name="websiteresource", + old_name="date_published_iso", + new_name="date_published", + ), + ] diff --git a/hospexplorer/ask/tasks.py b/hospexplorer/ask/tasks.py index 42edfcd..a6eff2c 100644 --- a/hospexplorer/ask/tasks.py +++ b/hospexplorer/ask/tasks.py @@ -164,15 +164,20 @@ def _build_resource_metadata(obj): } -def run_kb_resource_upload(model_label, resource_id): +def run_kb_resource_upload(model_label, resource_id, replace=False): """Background thread: push a resource to the MCP KB and record its doc_id. Runs outside the admin's atomic save transaction so a slow or timing-out MCP call can't roll back the local row. The object's status/status_message are updated at each phase so the admin can surface progress and errors. + + When ``replace`` is True and the row already has an ``mcp_kb_document_id``, + the existing KB doc is deleted before the new one is added — used by the + zip importer when an "update file" / "update metadata" re-upload would + otherwise leave the old chunks in the KB alongside the new ones. """ from ask.models import WebsiteResource, PDFResource, Resource - from ask.kb_connector import add_pdf_to_kb, add_website_to_kb + from ask.kb_connector import add_pdf_to_kb, add_website_to_kb, delete_kb_document if model_label == "pdf": Model = PDFResource @@ -188,6 +193,19 @@ def run_kb_resource_upload(model_label, resource_id): logger.error("run_kb_resource_upload: %s id=%s not found", model_label, resource_id) return + if replace and obj.mcp_kb_document_id: + # best-effort: if delete fails the re-add still happens, leaving a + # stale duplicate in the KB rather than losing the new upload + old_doc_id = obj.mcp_kb_document_id + try: + delete_kb_document(old_doc_id) + except Exception: + logger.warning( + "run_kb_resource_upload: failed to delete old KB doc_id=%s for %s id=%s; " + "re-adding anyway", old_doc_id, model_label, resource_id, + ) + obj.mcp_kb_document_id = None + try: metadata = _build_resource_metadata(obj) if model_label == "pdf": diff --git a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html index 0bfe104..e6bd9d5 100644 --- a/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html +++ b/hospexplorer/ask/templates/admin/ask/pdfresource/upload_zip.html @@ -30,6 +30,18 @@

+

+ +

+

+ +

Cancel diff --git a/hospexplorer/ask/tests.py b/hospexplorer/ask/tests.py index 0e27975..b123d95 100644 --- a/hospexplorer/ask/tests.py +++ b/hospexplorer/ask/tests.py @@ -200,6 +200,141 @@ def test_zip_import_tolerates_whitespace_in_csv_header(self): self.assertEqual(pdf.date_published, "2021") self.assertEqual(pdf.document_type.name, "Report") + def test_zip_update_file_overwrites_existing_pdf(self): + csv_text = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + zip2 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 updated"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), + {"zip_file": zip2, "update_file": "on"}, + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf = PDFResource.objects.get(title="Annual Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 updated") + + def test_zip_update_metadata_refreshes_fields(self): + csv_text_1 = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text_1, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + pdf = PDFResource.objects.get(title="Annual Report") + self.assertEqual(pdf.date_published, "") + self.assertIsNone(pdf.document_type_id) + + csv_text_2 = ( + "filename,title,date_published,document_type\r\n" + "report.pdf,Annual Report,2024-03,Report\r\n" + ) + # second zip's bytes must NOT replace the file when only update_metadata is on + zip2 = self._build_zip(csv_text_2, {"report.pdf": b"%PDF-1.4 ignored"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), + {"zip_file": zip2, "update_metadata": "on"}, + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf.refresh_from_db() + self.assertEqual(pdf.date_published, "2024-03") + self.assertEqual(pdf.document_type.name, "Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 original") + + def test_zip_no_update_flags_preserve_skip_behavior(self): + csv_text = "filename,title\r\nreport.pdf,Annual Report\r\n" + zip1 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 original"}) + self.client.post(reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip1}) + + zip2 = self._build_zip(csv_text, {"report.pdf": b"%PDF-1.4 updated"}) + response = self.client.post( + reverse("admin:ask_pdfresource_upload_zip"), {"zip_file": zip2} + ) + self.assertEqual(response.status_code, 302) + + self.assertEqual(PDFResource.objects.count(), 1) + pdf = PDFResource.objects.get(title="Annual Report") + with pdf.file.open("rb") as f: + self.assertEqual(f.read(), b"%PDF-1.4 original") + + +class RunKbResourceUploadReplaceTests(TestCase): + def setUp(self): + media_root = tempfile.mkdtemp() + self.addCleanup(shutil.rmtree, media_root, ignore_errors=True) + override = override_settings(MEDIA_ROOT=media_root) + override.enable() + self.addCleanup(override.disable) + # the real run_kb_resource_upload closes DB connections in finally — + # that kills the TestCase's wrapping transaction connection and breaks + # later tests, so neutralize it for this class + close_patcher = patch("ask.tasks.close_old_connections") + close_patcher.start() + self.addCleanup(close_patcher.stop) + self.user = User.objects.create_user("curator", password="pw") + + def _make_pdf(self, mcp_id): + obj = PDFResource( + title="Annual Report", + original_filename="report.pdf", + creator=self.user, + modifier=self.user, + mcp_kb_document_id=mcp_id, + ) + obj.file.save("report.pdf", ContentFile(b"%PDF-1.4 test"), save=True) + return obj + + def test_replace_deletes_old_doc_then_re_adds(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=42) + with patch.object(kb_connector, "delete_kb_document") as mock_del, patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_del.assert_called_once_with(42) + mock_add.assert_called_once() + obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) + self.assertEqual(obj.status, PDFResource.Status.SUCCESS) + + def test_replace_without_existing_doc_id_skips_delete(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=None) + with patch.object(kb_connector, "delete_kb_document") as mock_del, patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_del.assert_not_called() + mock_add.assert_called_once() + obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) + + def test_replace_swallows_delete_failure_and_still_adds(self): + from ask import kb_connector + from ask.tasks import run_kb_resource_upload + + obj = self._make_pdf(mcp_id=42) + with patch.object( + kb_connector, "delete_kb_document", side_effect=Exception("boom") + ), patch.object( + kb_connector, "add_pdf_to_kb", return_value={"doc_id": 99} + ) as mock_add: + run_kb_resource_upload("pdf", obj.pk, replace=True) + + mock_add.assert_called_once() + obj.refresh_from_db() + self.assertEqual(obj.mcp_kb_document_id, 99) + class KBAddPdfResourceViewTests(TestCase): """The Track-in-Hopper endpoint for untracked KB PDFs @@ -218,7 +353,6 @@ def setUp(self): override = override_settings(MEDIA_ROOT=media_root) override.enable() self.addCleanup(override.disable) - self.user = User.objects.create_user("curator", password="pw") self.user.user_permissions.add( Permission.objects.get(codename="add_pdfresource")