import json
import logging
from typing import List, Optional

from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from pydantic import TypeAdapter

from app.schemas.resume_dto import (
    ExperiencePresetSchema,
    ExperienceSummary,
    MergeExperiencePayload,
    Step2V2ExtractionResponse,
)


logger = logging.getLogger(__name__)

router = APIRouter()


def extract_step2_v2_from_text(text, selected_experiences, preset_schemas):
    """Delegate to the service-layer ``extract_step2_v2_from_text``.

    The service module is imported at call time, not at module import time,
    so this router module can be imported (and this name patched) without
    importing the service module first.
    """
    from app.services.experience_extraction_service import (
        extract_step2_v2_from_text as implementation,
    )

    return implementation(text, selected_experiences, preset_schemas)


def extract_step2_v2_from_url(url, selected_experiences, preset_schemas):
    """Delegate to the service-layer ``extract_step2_v2_from_url`` (imported lazily)."""
    from app.services.experience_extraction_service import (
        extract_step2_v2_from_url as implementation,
    )

    return implementation(url, selected_experiences, preset_schemas)


def extract_step2_v2_from_pdf(file_content, selected_experiences, preset_schemas):
    """Delegate to the service-layer ``extract_step2_v2_from_pdf`` (imported lazily)."""
    from app.services.experience_extraction_service import (
        extract_step2_v2_from_pdf as implementation,
    )

    return implementation(file_content, selected_experiences, preset_schemas)


def apply_sequential_merge_results_to_step2(experiences, existing_experiences):
    """Delegate to the merge service's function of the same name (imported lazily)."""
    from app.services.experience_merge_service import (
        apply_sequential_merge_results_to_step2 as implementation,
    )

    return implementation(experiences, existing_experiences)


@router.post(
    "/extract-experiences/step2-v2",
    response_model=Step2V2ExtractionResponse,
)
async def extract_experiences_step2_v2(
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    text: Optional[str] = Form(None),
    selected_experiences: str = Form(...),
    existing_experiences: str = Form("[]"),
    preset_schemas: str = Form(...),
):
    """Run step-2 (V2) detail extraction for the selected experiences.

    Exactly one source is used, in this priority order: an uploaded file
    (with a non-empty filename), then ``url``, then ``text``. A file whose
    name ends in ``.pdf`` goes to the PDF extractor; any other file is
    decoded as UTF-8 text.

    Args:
        file: Optional uploaded document.
        url: Optional URL of the document to fetch.
        text: Optional raw text of the document.
        selected_experiences: JSON array validated as ``ExperienceSummary``.
        existing_experiences: JSON array validated as
            ``MergeExperiencePayload``; defaults to an empty array.
        preset_schemas: JSON array validated as ``ExperiencePresetSchema``.

    Returns:
        The extraction response, with ``experiences`` replaced by the output
        of ``apply_sequential_merge_results_to_step2``.

    Raises:
        HTTPException: 400 when no usable source is given, when a JSON form
            field fails to parse or validate, or when extraction raises
            ``ValueError``; 500 for any other extraction failure.
    """
    # An upload part with an empty filename carries no document. Treat it as
    # "no file" in the guard as well as in the dispatch below, so such a
    # request gets a 400 instead of reaching `text.strip()` with text=None.
    has_file = bool(file and file.filename)
    source_url = url.strip() if url else ""
    source_text = text.strip() if text else ""

    if not (has_file or source_url or source_text):
        raise HTTPException(
            status_code=400,
            detail="file (업로드 파일), url, text 중 최소 하나는 제공되어야 합니다.",
        )

    try:
        selected_list = TypeAdapter(List[ExperienceSummary]).validate_python(
            json.loads(selected_experiences)
        )
        existing_list = TypeAdapter(List[MergeExperiencePayload]).validate_python(
            json.loads(existing_experiences)
        )
        preset_list = TypeAdapter(List[ExperiencePresetSchema]).validate_python(
            json.loads(preset_schemas)
        )
    except ValueError as e:
        # json.JSONDecodeError and pydantic's ValidationError are both
        # ValueError subclasses; anything else is a server-side bug and
        # should not be reported as a client error.
        raise HTTPException(
            status_code=400,
            detail=f"Step2 V2 요청 JSON 파싱 오류: {str(e)}",
        ) from e

    try:
        # The extractors and the merge step are synchronous; running them in
        # the thread pool keeps the event loop free while they work. The
        # callables are looked up by name here, so patching them still works.
        if has_file:
            file_content = await file.read()
            if file.filename.lower().endswith(".pdf"):
                result = await run_in_threadpool(
                    extract_step2_v2_from_pdf,
                    file_content,
                    selected_list,
                    preset_list,
                )
            else:
                # A non-UTF-8 upload raises UnicodeDecodeError, which is a
                # ValueError and is therefore reported as a 400 below.
                result = await run_in_threadpool(
                    extract_step2_v2_from_text,
                    file_content.decode("utf-8"),
                    selected_list,
                    preset_list,
                )
        elif source_url:
            result = await run_in_threadpool(
                extract_step2_v2_from_url,
                source_url,
                selected_list,
                preset_list,
            )
        else:
            result = await run_in_threadpool(
                extract_step2_v2_from_text,
                source_text,
                selected_list,
                preset_list,
            )

        result.experiences = await run_in_threadpool(
            apply_sequential_merge_results_to_step2,
            result.experiences,
            existing_list,
        )
        return result
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Record the traceback server-side; the client only sees the message.
        logger.exception("Step2 V2 experience extraction failed")
        raise HTTPException(status_code=500, detail=str(e)) from e
analyze_job_pdf @@ -38,11 +39,15 @@ extract_step2_from_url, extract_step2_from_pdf, ) -from app.services.experience_merge_service import apply_merge_results_to_step2, check_merge_candidates +from app.services.experience_merge_service import ( + apply_merge_results_to_step2, + check_merge_candidates, +) from app.services.eval_service import log_evaluation router = APIRouter() +router.include_router(experience_extraction_v2_router) workflow = create_workflow() diff --git a/myeongsung/app/services/experience_extraction_service.py b/myeongsung/app/services/experience_extraction_service.py index d634616..c963778 100644 --- a/myeongsung/app/services/experience_extraction_service.py +++ b/myeongsung/app/services/experience_extraction_service.py @@ -4,8 +4,17 @@ import fitz # PyMuPDF from langchain_openai import ChatOpenAI from langchain_core.prompts import ChatPromptTemplate -from typing import List -from app.schemas.resume_dto import ExperienceExtractionResponse, Step1ExtractionResponse, ExperienceSummary, Step2ExtractionResponse, Step2ExtractedExperience +from typing import Any, List, Optional +from app.schemas.resume_dto import ( + ExperienceExtractionResponse, + ExperiencePresetSchema, + ExperienceSummary, + Step1ExtractionResponse, + Step2ExtractionResponse, + Step2ExtractedExperience, + Step2V2ExtractionResponse, +) +from app.services.experience_preset_service import build_dynamic_step2_model def extract_step1_from_text(text: str) -> Step1ExtractionResponse: """ 텍스트에서 1차 경험 추출 (상세 증빙형 / 스펙 증빙형 분류 및 경험명 추출) @@ -173,14 +182,8 @@ def extract_experiences_from_pdf(file_content: bytes) -> ExperienceExtractionRes except Exception as e: raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") -def extract_step2_from_text(text: str, selected_experiences: List[ExperienceSummary]) -> Step2ExtractionResponse: - """ - 1차 추출에서 사용자가 선택한 경험 리스트를 바탕으로, 각 경험의 소분류에 맞는 맞춤형 필드를 원문에서 추출합니다. - (TPM 제한 방지를 위해 각 경험별로 개별 추출 후 병합합니다.) 
def _step2_v2_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of an in-memory PDF, joined by newlines."""
    # Opening the document as a context manager closes it (and releases the
    # underlying MuPDF handle) even when text extraction raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "\n".join(page.get_text() for page in doc)


def _step2_v2_html_text(html: str) -> str:
    """Return the visible text of an HTML page, one non-empty chunk per line."""
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()
    raw_text = soup.get_text(separator="\n")
    lines = (line.strip() for line in raw_text.splitlines())
    # Split on runs of two spaces, not on every single space, so that
    # ordinary sentences stay on one line.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return "\n".join(chunk for chunk in chunks if chunk)


def extract_step2_v2_from_text(
    text: str,
    selected_experiences: List[ExperienceSummary],
    preset_schemas: List[ExperiencePresetSchema],
    llm: Optional[Any] = None,
) -> Step2V2ExtractionResponse:
    """Extract preset-driven details for each selected experience from ``text``.

    For every selected experience the preset whose ``experience_type_name``
    equals the experience's ``experience_type`` is looked up, an output model
    is built with ``build_dynamic_step2_model``, and the LLM is invoked once
    with that model as its structured output.

    Args:
        text: Source document text.
        selected_experiences: Experiences to extract details for.
        preset_schemas: Presets available for the selected experiences.
        llm: Optional chat model to use instead of the default
            ``ChatOpenAI(model="gpt-4o", temperature=0)``.

    Returns:
        A response whose ``experiences`` holds one ``model_dump()`` per
        selected experience, in selection order.

    Raises:
        ValueError: When a selected experience has no preset, when the LLM
            call fails, or when the result's type or group differs from the
            selected experience.
    """
    preset_by_type = {
        preset.experience_type_name: preset
        for preset in preset_schemas
    }
    prompt = _step2_prompt()
    # Compare against None so a caller-supplied model is never replaced just
    # because it happens to evaluate as falsy.
    extraction_llm = (
        llm if llm is not None else ChatOpenAI(model="gpt-4o", temperature=0)
    )

    # Resolve every preset and build every output model before the first LLM
    # call, so a missing preset fails the request without paying for
    # extractions whose results would be thrown away.
    planned = []
    for experience in selected_experiences:
        preset = preset_by_type.get(experience.experience_type)
        if preset is None:
            raise ValueError(
                f"'{experience.experience_type}' 경험의 프리셋 스키마가 없습니다."
            )
        planned.append((experience, build_dynamic_step2_model(experience, preset)))

    extracted_experiences = []
    for experience, output_model in planned:
        chain = prompt | extraction_llm.with_structured_output(output_model)
        try:
            result = chain.invoke(
                {
                    "text": text,
                    "selected_experience": experience.model_dump(),
                },
                config={
                    "run_name": f"experience-step2-v2-extraction-{experience.experience_name}",
                    "tags": ["experience-extraction", "step2-v2"],
                },
            )
        except Exception as e:
            raise ValueError(
                f"'{experience.experience_name}' 2차 V2 경험 추출 중 오류가 발생했습니다: {str(e)}"
            ) from e

        # The model must echo the classification it was asked about.
        if result.experience_type != experience.experience_type:
            raise ValueError("AI 응답의 경험 유형이 선택 경험과 일치하지 않습니다.")
        if result.experience_group != experience.experience_group:
            raise ValueError("AI 응답의 경험 그룹이 선택 경험과 일치하지 않습니다.")
        extracted_experiences.append(result.model_dump())

    return Step2V2ExtractionResponse(experiences=extracted_experiences)


def extract_step2_v2_from_url(
    url: str,
    selected_experiences: List[ExperienceSummary],
    preset_schemas: List[ExperiencePresetSchema],
) -> Step2V2ExtractionResponse:
    """Fetch ``url``, turn the response into text, and run V2 step-2 extraction.

    The response is treated as a PDF when its ``Content-Type`` contains
    ``application/pdf`` or the URL path (query string removed) ends in
    ``.pdf``; otherwise it is parsed as HTML with scripts and styles removed.

    Raises:
        ValueError: For any failure, including fetch errors, empty text and
            extraction errors; the original exception is chained as the cause.
    """
    try:
        # NOTE(security): `url` is fetched as given, with no host allow-list
        # and no response size limit. If it can come from an untrusted
        # client, restrict the reachable hosts before calling this.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()

        content_type = response.headers.get("Content-Type", "").lower()
        if "application/pdf" in content_type or url.lower().split("?")[0].endswith(".pdf"):
            full_text = _step2_v2_pdf_text(response.content)
        else:
            full_text = _step2_v2_html_text(response.text)

        if not full_text.strip():
            raise ValueError("URL에서 유의미한 텍스트를 추출하지 못했습니다.")
        return extract_step2_v2_from_text(
            full_text,
            selected_experiences,
            preset_schemas,
        )
    except Exception as e:
        raise ValueError(f"URL 분석 중 오류가 발생했습니다: {str(e)}") from e


def extract_step2_v2_from_pdf(
    file_content: bytes,
    selected_experiences: List[ExperienceSummary],
    preset_schemas: List[ExperiencePresetSchema],
) -> Step2V2ExtractionResponse:
    """Extract the text of a PDF given as bytes and run V2 step-2 extraction.

    Raises:
        ValueError: For any failure, including unreadable PDFs, empty text and
            extraction errors; the original exception is chained as the cause.
    """
    try:
        full_text = _step2_v2_pdf_text(file_content)
        if not full_text.strip():
            raise ValueError("PDF에서 유의미한 텍스트를 추출하지 못했습니다.")
        return extract_step2_v2_from_text(
            full_text,
            selected_experiences,
            preset_schemas,
        )
    except Exception as e:
        raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") from e
import json
import subprocess
import sys
import unittest
from unittest.mock import patch


def _fastapi_runtime_available() -> bool:
    """Return True when a child interpreter can import the API test runtime.

    The probe runs ``import unicodedata, fastapi, httpx`` in a separate
    process and reports success only when that process exits with code 0.
    NOTE(review): presumably a subprocess is used so that a hard failure
    while loading native modules cannot take down the test run - confirm.
    """
    try:
        completed = subprocess.run(
            [
                sys.executable,
                "-c",
                "import unicodedata, fastapi, httpx",
            ],
            check=False,
            capture_output=True,
            timeout=10,
        )
    except (subprocess.TimeoutExpired, OSError):
        # A probe that hangs or cannot be spawned at all means the runtime is
        # unusable; skip the suite rather than crash during collection.
        return False
    return completed.returncode == 0


@unittest.skipUnless(
    _fastapi_runtime_available(),
    "현재 Python 런타임에서 FastAPI 네이티브 의존성을 로드할 수 없습니다.",
)
class ExperienceStep2V2ApiTest(unittest.TestCase):
    """HTTP-level tests for ``POST /api/v1/extract-experiences/step2-v2``.

    The service functions are patched on the router module, so these tests
    cover only request parsing, dispatch and error mapping.
    """

    def setUp(self):
        # Imported here so that collecting this module does not require
        # FastAPI or the application package.
        from fastapi import FastAPI
        from fastapi.testclient import TestClient
        from app.api.experience_extraction_v2 import router

        app = FastAPI()
        app.include_router(router, prefix="/api/v1")
        self.client = TestClient(app)

    def test_step2_v2_parses_spring_multipart_contract_and_returns_merge_shape(self):
        """Form fields are parsed into DTOs and merge metadata is returned."""
        from app.schemas.resume_dto import Step2V2ExtractionResponse

        captured = {}

        def fake_extract(url, selected, presets):
            captured["url"] = url
            captured["selected"] = selected
            captured["presets"] = presets
            return Step2V2ExtractionResponse(experiences=[{
                "experience_name": "캡스톤 프로젝트",
                "experience_group": "상세 서술형",
                "experience_type": "프로젝트",
                "keywords": ["실행력"],
                "basic_info": {
                    "project_name": "캡스톤 프로젝트",
                    "period": "2026.01 ~ 2026.06",
                },
                "experience_content": "추천 모델을 개발했습니다.",
            }])

        def fake_merge(experiences, existing):
            captured["existing"] = existing
            experiences[0]["needs_merge"] = True
            experiences[0]["merge_candidate_id"] = "existing-1"
            experiences[0]["merge_similarity"] = 0.92
            return experiences

        with patch(
            "app.api.experience_extraction_v2.extract_step2_v2_from_url",
            side_effect=fake_extract,
        ), patch(
            "app.api.experience_extraction_v2.apply_sequential_merge_results_to_step2",
            side_effect=fake_merge,
        ):
            response = self.client.post(
                "/api/v1/extract-experiences/step2-v2",
                data={
                    "url": "https://cdn.example.com/resume.pdf",
                    "selected_experiences": json.dumps([{
                        "experience_name": "캡스톤 프로젝트",
                        "experience_group": "상세 서술형",
                        "experience_type": "프로젝트",
                    }], ensure_ascii=False),
                    "existing_experiences": json.dumps([{
                        "id": "existing-1",
                        "title": "기존 프로젝트",
                        "experience_group": "상세 서술형",
                        "experience_type": "프로젝트",
                    }], ensure_ascii=False),
                    "preset_schemas": json.dumps([{
                        "experience_group": "상세 서술형",
                        "experience_type": "PROJECT",
                        "experience_type_name": "프로젝트",
                        "fields": [
                            {"key": "project_name", "label": "프로젝트명"},
                            {"key": "period", "label": "진행 기간"},
                        ],
                    }], ensure_ascii=False),
                },
            )

        self.assertEqual(200, response.status_code)
        self.assertEqual(
            "https://cdn.example.com/resume.pdf",
            captured["url"],
        )
        self.assertEqual(
            "캡스톤 프로젝트",
            captured["selected"][0].experience_name,
        )
        self.assertEqual(
            "PROJECT",
            captured["presets"][0].experience_type,
        )
        self.assertEqual("existing-1", captured["existing"][0].id)
        body = response.json()["experiences"][0]
        self.assertTrue(body["needs_merge"])
        self.assertEqual("existing-1", body["merge_candidate_id"])
        self.assertEqual(0.92, body["merge_similarity"])
        self.assertEqual(
            "캡스톤 프로젝트",
            body["basic_info"]["project_name"],
        )

    def test_step2_v2_rejects_invalid_preset_json(self):
        """Malformed ``preset_schemas`` JSON is reported as a 400."""
        response = self.client.post(
            "/api/v1/extract-experiences/step2-v2",
            data={
                "url": "https://cdn.example.com/resume.pdf",
                "selected_experiences": "[]",
                "existing_experiences": "[]",
                "preset_schemas": "{invalid-json",
            },
        )

        self.assertEqual(400, response.status_code)
        self.assertIn(
            "Step2 V2 요청 JSON 파싱 오류",
            response.json()["detail"],
        )

    def test_step2_v2_returns_400_when_selected_experience_and_preset_mismatch(self):
        """An error raised while building the preset model maps to a 400."""
        from app.services.experience_preset_service import (
            build_dynamic_step2_model,
        )

        def mismatched_extract(url, selected, presets):
            # Expected to raise for the mismatched group; the endpoint should
            # surface that message unchanged.
            build_dynamic_step2_model(selected[0], presets[0])

        with patch(
            "app.api.experience_extraction_v2.extract_step2_v2_from_url",
            side_effect=mismatched_extract,
        ):
            response = self.client.post(
                "/api/v1/extract-experiences/step2-v2",
                data={
                    "url": "https://cdn.example.com/resume.pdf",
                    "selected_experiences": json.dumps([{
                        "experience_name": "토익",
                        "experience_group": "스펙·증빙형",
                        "experience_type": "어학",
                    }], ensure_ascii=False),
                    "existing_experiences": "[]",
                    "preset_schemas": json.dumps([{
                        "experience_group": "상세 서술형",
                        "experience_type": "LANGUAGE",
                        "experience_type_name": "어학",
                        "fields": [],
                    }], ensure_ascii=False),
                },
            )

        self.assertEqual(400, response.status_code)
        self.assertEqual(
            "선택 경험과 프리셋의 경험 그룹이 일치하지 않습니다.",
            response.json()["detail"],
        )


if __name__ == "__main__":
    unittest.main()