diff --git a/myeongsung/app/api/experience_extraction_v2.py b/myeongsung/app/api/experience_extraction_v2.py new file mode 100644 index 0000000..c29ec6d --- /dev/null +++ b/myeongsung/app/api/experience_extraction_v2.py @@ -0,0 +1,120 @@ +import json +from typing import List, Optional + +from fastapi import APIRouter, File, Form, HTTPException, UploadFile +from pydantic import TypeAdapter + +from app.schemas.resume_dto import ( + ExperiencePresetSchema, + ExperienceSummary, + MergeExperiencePayload, + Step2V2ExtractionResponse, +) + + +router = APIRouter() + + +def extract_step2_v2_from_text(text, selected_experiences, preset_schemas): + from app.services.experience_extraction_service import ( + extract_step2_v2_from_text as implementation, + ) + + return implementation(text, selected_experiences, preset_schemas) + + +def extract_step2_v2_from_url(url, selected_experiences, preset_schemas): + from app.services.experience_extraction_service import ( + extract_step2_v2_from_url as implementation, + ) + + return implementation(url, selected_experiences, preset_schemas) + + +def extract_step2_v2_from_pdf(file_content, selected_experiences, preset_schemas): + from app.services.experience_extraction_service import ( + extract_step2_v2_from_pdf as implementation, + ) + + return implementation(file_content, selected_experiences, preset_schemas) + + +def apply_sequential_merge_results_to_step2(experiences, existing_experiences): + from app.services.experience_merge_service import ( + apply_sequential_merge_results_to_step2 as implementation, + ) + + return implementation(experiences, existing_experiences) + + +@router.post( + "/extract-experiences/step2-v2", + response_model=Step2V2ExtractionResponse, +) +async def extract_experiences_step2_v2( + file: Optional[UploadFile] = File(None), + url: Optional[str] = Form(None), + text: Optional[str] = Form(None), + selected_experiences: str = Form(...), + existing_experiences: str = Form("[]"), + preset_schemas: str = Form(...), +): + if not file and not (url and url.strip()) and not (text and text.strip()): + raise HTTPException( + status_code=400, + detail="file (업로드 파일), url, text 중 최소 하나는 제공되어야 합니다.", + ) + + try: + selected_list = TypeAdapter(List[ExperienceSummary]).validate_python( + json.loads(selected_experiences) + ) + existing_list = TypeAdapter(List[MergeExperiencePayload]).validate_python( + json.loads(existing_experiences) + ) + preset_list = TypeAdapter(List[ExperiencePresetSchema]).validate_python( + json.loads(preset_schemas) + ) + except Exception as e: + raise HTTPException( + status_code=400, + detail=f"Step2 V2 요청 JSON 파싱 오류: {str(e)}", + ) from e + + try: + if file and file.filename: + file_content = await file.read() + if file.filename.lower().endswith(".pdf"): + result = extract_step2_v2_from_pdf( + file_content, + selected_list, + preset_list, + ) + else: + result = extract_step2_v2_from_text( + file_content.decode("utf-8"), + selected_list, + preset_list, + ) + elif url and url.strip(): + result = extract_step2_v2_from_url( + url.strip(), + selected_list, + preset_list, + ) + else: + result = extract_step2_v2_from_text( + text.strip(), + selected_list, + preset_list, + ) + + result.experiences = apply_sequential_merge_results_to_step2( + result.experiences, + existing_list, + ) + return result + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/myeongsung/app/api/router.py b/myeongsung/app/api/router.py index 02e8557..5682c3f 100644 --- a/myeongsung/app/api/router.py +++ b/myeongsung/app/api/router.py @@ -23,6 +23,7 @@ Step2ExtractionResponse, ) +from app.api.experience_extraction_v2 import router as experience_extraction_v2_router from app.services.resume_service import create_workflow, parse_and_validate_experiences from app.services.job_analysis_service import analyze_job_url from app.services.pdf_analysis_service import analyze_job_pdf @@ -38,11 +39,15 @@ extract_step2_from_url, extract_step2_from_pdf, ) -from app.services.experience_merge_service import apply_merge_results_to_step2, check_merge_candidates +from app.services.experience_merge_service import ( + apply_merge_results_to_step2, + check_merge_candidates, +) from app.services.eval_service import log_evaluation router = APIRouter() +router.include_router(experience_extraction_v2_router) workflow = create_workflow() diff --git a/myeongsung/app/schemas/resume_dto.py b/myeongsung/app/schemas/resume_dto.py index 3adfd54..aaf148f 100644 --- a/myeongsung/app/schemas/resume_dto.py +++ b/myeongsung/app/schemas/resume_dto.py @@ -1,6 +1,95 @@ -from pydantic import BaseModel, Field -from typing import List, Optional, Union, Any, Dict from datetime import datetime +from typing import List, Optional, Union, Any, Dict + +from pydantic import BaseModel, ConfigDict, Field, model_validator + + +PRESET_FIELDS: Dict[str, List[str]] = { + "프로젝트": ["project_name", "period", "role", "organization", "achievements"], + "PROJECT": ["project_name", "period", "role", "organization", "achievements"], + "대외활동": ["activity_name", "organization", "period", "role", "achievements"], + "ACTIVITY": ["activity_name", "organization", "period", "role", "achievements"], + "인턴/직무경험": ["organization", "department", "period", "task", "achievements"], + "INTERN": ["organization", "department", "period", "task", "achievements"], + "공모전": ["competition_name", "organization", "period", "role", "achievements"], + "CONTEST": ["competition_name", "organization", "period", "role", "achievements"], + "봉사활동": ["activity_name", "organization", "period", "role"], + "VOLUNTEER": ["activity_name", "organization", "period", "role"], + "교환학생": ["location", "organization", "period", "major"], + "EXCHANGE": ["location", "organization", "period", "major"], + "알바": ["workplace_name", "period", "work_type", "task", "key_experience"], + "ALBA": ["workplace_name", "period", "work_type", "task", "key_experience"], + "학부연구생": ["lab_name", "organization", "period", "research_topic", "role", "deliverables"], + "RESEARCH": ["lab_name", "organization", "period", "research_topic", "role", "deliverables"], + "어학": ["exam_name", "score", "exam_date", "expiration_date", "score_report"], + "LANGUAGE": ["exam_name", "score", "exam_date", "expiration_date", "score_report"], + "자격증": ["certificate_name", "organization", "acquisition_date", "expiration_date", "certificate_copy"], + "LICENSE": ["certificate_name", "organization", "acquisition_date", "expiration_date", "certificate_copy"], + "수상": ["award_name", "organization", "award_date", "award_grade", "award_proof"], + "AWARD": ["award_name", "organization", "award_date", "award_grade", "award_proof"], + "수강과목": ["course_name", "semester", "credit", "grade", "major"], + "COURSE": ["course_name", "semester", "credit", "grade", "major"], + "교육 이수": ["education_name", "organization", "period", "completion_status", "completion_certificate"], + "EDUCATION": ["education_name", "organization", "period", "completion_status", "completion_certificate"], +} + +PRESET_LABEL_ALIASES: Dict[str, Dict[str, str]] = { + "프로젝트": {"프로젝트명": "project_name", "진행 기간": "period", "역할": "role", "소속/팀": "organization", "주요 성과": "achievements"}, + "PROJECT": {"프로젝트명": "project_name", "진행 기간": "period", "역할": "role", "소속/팀": "organization", "주요 성과": "achievements"}, + "대외활동": {"활동명": "activity_name", "주관기관": "organization", "활동 기간": "period", "역할": "role", "주요 성과": "achievements"}, + "ACTIVITY": {"활동명": "activity_name", "주관기관": "organization", "활동 기간": "period", "역할": "role", "주요 성과": "achievements"}, + "인턴/직무경험": {"회사/기관명": "organization", "직무/부서": "department", "근무/참여 기간": "period", "담당 업무": "task", "주요 성과": "achievements"}, + "INTERN": {"회사/기관명": "organization", "직무/부서": "department", "근무/참여 기간": "period", "담당 업무": "task", "주요 성과": "achievements"}, + "공모전": {"공모전명": "competition_name", "주관기관": "organization", "참가 기간": "period", "역할": "role", "수상/결과": "achievements"}, + "CONTEST": {"공모전명": "competition_name", "주관기관": "organization", "참가 기간": "period", "역할": "role", "수상/결과": "achievements"}, + "봉사활동": {"활동명": "activity_name", "기관/단체": "organization", "활동 기간": "period", "역할": "role"}, + "VOLUNTEER": {"활동명": "activity_name", "기관/단체": "organization", "활동 기간": "period", "역할": "role"}, + "교환학생": {"국가/도시": "location", "학교명": "organization", "파견 기간": "period", "전공/수강 분야": "major"}, + "EXCHANGE": {"국가/도시": "location", "학교명": "organization", "파견 기간": "period", "전공/수강 분야": "major"}, + "알바": {"근무처명": "workplace_name", "근무 기간": "period", "업무 유형": "work_type", "담당 업무": "task", "주요 경험": "key_experience"}, + "ALBA": {"근무처명": "workplace_name", "근무 기간": "period", "업무 유형": "work_type", "담당 업무": "task", "주요 경험": "key_experience"}, + "학부연구생": {"연구실명": "lab_name", "소속 기관": "organization", "참여 기간": "period", "연구 주제": "research_topic", "담당 역할": "role", "주요 결과물": "deliverables"}, + "RESEARCH": {"연구실명": "lab_name", "소속 기관": "organization", "참여 기간": "period", "연구 주제": "research_topic", "담당 역할": "role", "주요 결과물": "deliverables"}, + "어학": {"시험명": "exam_name", "점수/등급": "score", "응시일": "exam_date", "유효기간": "expiration_date", "성적표": "score_report"}, + "LANGUAGE": {"시험명": "exam_name", "점수/등급": "score", "응시일": "exam_date", "유효기간": "expiration_date", "성적표": "score_report"}, + "자격증": {"자격증명": "certificate_name", "발급기관": "organization", "취득일": "acquisition_date", "유효기간": "expiration_date", "자격증 사본": "certificate_copy"}, + "LICENSE": {"자격증명": "certificate_name", "발급기관": "organization", "취득일": "acquisition_date", "유효기간": "expiration_date", "자격증 사본": "certificate_copy"}, + "수상": {"수상명": "award_name", "수여기관": "organization", "수상일": "award_date", "수상 구분": "award_grade", "수상 증빙": "award_proof"}, + "AWARD": {"수상명": "award_name", "수여기관": "organization", "수상일": "award_date", "수상 구분": "award_grade", "수상 증빙": "award_proof"}, + "수강과목": {"과목명": "course_name", "이수 학기": "semester", "학점": "credit", "성적": "grade", "관련 분야": "major"}, + "COURSE": {"과목명": "course_name", "이수 학기": "semester", "학점": "credit", "성적": "grade", "관련 분야": "major"}, + "교육 이수": {"교육명": "education_name", "운영기관": "organization", "교육 기간": "period", "수료 여부": "completion_status", "수료증": "completion_certificate"}, + "EDUCATION": {"교육명": "education_name", "운영기관": "organization", "교육 기간": "period", "수료 여부": "completion_status", "수료증": "completion_certificate"}, +} + + +def _clean_preset_key(value: str) -> str: + return value.replace(" ", "").replace("_", "").strip().lower() + + +def normalize_basic_info(experience_type: str, basic_info: Any) -> Dict[str, Any]: + if isinstance(basic_info, BaseModel): + raw_info = basic_info.model_dump() + elif isinstance(basic_info, dict): + raw_info = basic_info + else: + return {} + + allowed_keys = PRESET_FIELDS.get(experience_type) + if not allowed_keys: + return raw_info + + alias_map = PRESET_LABEL_ALIASES.get(experience_type, {}) + cleaned_aliases = {_clean_preset_key(key): value for key, value in alias_map.items()} + cleaned_allowed = {_clean_preset_key(key): key for key in allowed_keys} + + normalized = {} + for raw_key, value in raw_info.items(): + key = str(raw_key) + normalized_key = cleaned_allowed.get(_clean_preset_key(key)) or cleaned_aliases.get(_clean_preset_key(key)) + if normalized_key in allowed_keys: + normalized[normalized_key] = value + return normalized # ── STAR 경험 입력 스키마 ────────────────────────────────────── class StarContent(BaseModel): @@ -65,83 +154,120 @@ class ExperienceSummary(BaseModel): class Step1ExtractionResponse(BaseModel): experiences: List[ExperienceSummary] = Field(..., description="1차 추출된 경험 목록") +class PresetFieldDefinition(BaseModel): + key: str = Field(..., description="Spring PresetRegistry의 필드 키") + label: str = Field(..., description="필드 한글 라벨") + + +class ExperiencePresetSchema(BaseModel): + experience_group: str = Field(..., description="경험 대분류 한글명") + experience_type: str = Field(..., description="Spring ExperienceType enum 코드") + experience_type_name: str = Field(..., description="경험 소분류 한글명") + fields: List[PresetFieldDefinition] = Field( + default_factory=list, + description="해당 경험 유형에서 허용하는 basic_info 필드", + ) + # ── 2차 추출 (소분류별 맞춤 스키마) ────────────────────────────────────── # [1] 상세 서술형 -class ProjectInfo(BaseModel): +class PresetInfo(BaseModel): + model_config = ConfigDict(extra="forbid") + + +class ProjectInfo(PresetInfo): project_name: Optional[str] = Field(None, description="프로젝트명") period: Optional[str] = Field(None, description="진행 기간") role: Optional[str] = Field(None, description="역할") organization: Optional[str] = Field(None, description="소속/팀") achievements: Optional[str] = Field(None, description="주요 성과") -class ActivityInfo(BaseModel): +class ActivityInfo(PresetInfo): activity_name: Optional[str] = Field(None, description="활동명") organization: Optional[str] = Field(None, description="주관기관") period: Optional[str] = Field(None, description="활동 기간") role: Optional[str] = Field(None, description="역할") achievements: Optional[str] = Field(None, description="주요 성과") -class InternInfo(BaseModel): +class InternInfo(PresetInfo): organization: Optional[str] = Field(None, description="회사/기관명") department: Optional[str] = Field(None, description="직무/부서") period: Optional[str] = Field(None, description="근무/참여 기간") task: Optional[str] = Field(None, description="담당 업무") achievements: Optional[str] = Field(None, description="주요 성과") -class CompetitionInfo(BaseModel): +class CompetitionInfo(PresetInfo): competition_name: Optional[str] = Field(None, description="공모전명") organization: Optional[str] = Field(None, description="주관기관") period: Optional[str] = Field(None, description="참가 기간") role: Optional[str] = Field(None, description="역할") achievements: Optional[str] = Field(None, description="수상/결과") -class VolunteerInfo(BaseModel): +class VolunteerInfo(PresetInfo): activity_name: Optional[str] = Field(None, description="활동명") organization: Optional[str] = Field(None, description="기관/단체") period: Optional[str] = Field(None, description="활동 기간") role: Optional[str] = Field(None, description="역할") -class ExchangeInfo(BaseModel): +class ExchangeInfo(PresetInfo): location: Optional[str] = Field(None, description="국가/도시") organization: Optional[str] = Field(None, description="학교명") period: Optional[str] = Field(None, description="파견 기간") major: Optional[str] = Field(None, description="전공/수강 분야") +class AlbaInfo(PresetInfo): + workplace_name: Optional[str] = Field(None, description="근무처명") + period: Optional[str] = Field(None, description="근무 기간") + work_type: Optional[str] = Field(None, description="업무 유형") + task: Optional[str] = Field(None, description="담당 업무") + key_experience: Optional[str] = Field(None, description="주요 경험") + +class ResearchInfo(PresetInfo): + lab_name: Optional[str] = Field(None, description="연구실명") + organization: Optional[str] = Field(None, description="소속 기관") + period: Optional[str] = Field(None, description="참여 기간") + research_topic: Optional[str] = Field(None, description="연구 주제") + role: Optional[str] = Field(None, description="담당 역할") + deliverables: Optional[str] = Field(None, description="주요 결과물") + # [2] 스펙·증빙형 -class LanguageInfo(BaseModel): +class LanguageInfo(PresetInfo): exam_name: Optional[str] = Field(None, description="시험명") score: Optional[str] = Field(None, description="점수/등급") exam_date: Optional[str] = Field(None, description="응시일") expiration_date: Optional[str] = Field(None, description="유효기간") + score_report: Optional[str] = Field(None, description="성적표") -class CertificateInfo(BaseModel): +class CertificateInfo(PresetInfo): certificate_name: Optional[str] = Field(None, description="자격증명") organization: Optional[str] = Field(None, description="발급기관") acquisition_date: Optional[str] = Field(None, description="취득일") expiration_date: Optional[str] = Field(None, description="유효기간") + certificate_copy: Optional[str] = Field(None, description="자격증 사본") -class AwardInfo(BaseModel): +class AwardInfo(PresetInfo): award_name: Optional[str] = Field(None, description="수상명") organization: Optional[str] = Field(None, description="수여기관") award_date: Optional[str] = Field(None, description="수상일") award_grade: Optional[str] = Field(None, description="수상 구분") + award_proof: Optional[str] = Field(None, description="수상 증빙") -class CourseInfo(BaseModel): +class CourseInfo(PresetInfo): course_name: Optional[str] = Field(None, description="과목명") semester: Optional[str] = Field(None, description="이수 학기") credit: Optional[str] = Field(None, description="학점") grade: Optional[str] = Field(None, description="성적") major: Optional[str] = Field(None, description="관련 분야") -class EducationInfo(BaseModel): +class EducationInfo(PresetInfo): education_name: Optional[str] = Field(None, description="교육명") organization: Optional[str] = Field(None, description="운영기관") period: Optional[str] = Field(None, description="교육 기간") completion_status: Optional[str] = Field(None, description="수료 여부") + completion_certificate: Optional[str] = Field(None, description="수료증") BasicInfoUnion = Union[ - ProjectInfo, ActivityInfo, InternInfo, CompetitionInfo, VolunteerInfo, ExchangeInfo, + ProjectInfo, ActivityInfo, InternInfo, CompetitionInfo, VolunteerInfo, ExchangeInfo, AlbaInfo, ResearchInfo, LanguageInfo, CertificateInfo, AwardInfo, CourseInfo, EducationInfo ] @@ -179,9 +305,32 @@ class Step2ExtractedExperience(BaseModel): merge_similarity: Optional[float] = Field(default=None, description="병합 후보와의 임베딩 유사도") writing_status: str = Field(default="in_progress", description="작성 종료 여부") + @model_validator(mode="before") + @classmethod + def normalize_basic_info_before_validation(cls, data: Any) -> Any: + if not isinstance(data, dict): + return data + experience_type = data.get("experience_type") or data.get("experienceType") + if experience_type and "basic_info" in data: + normalized = dict(data) + normalized["basic_info"] = normalize_basic_info(str(experience_type), data.get("basic_info")) + return normalized + return data + + @model_validator(mode="after") + def normalize_basic_info_after_validation(self): + self.basic_info = normalize_basic_info(self.experience_type, self.basic_info) + return self + class Step2ExtractionResponse(BaseModel): experiences: List[Step2ExtractedExperience] = Field(..., description="2차 추출된 경험 상세 목록") +class Step2V2ExtractionResponse(BaseModel): + experiences: List[Dict[str, Any]] = Field( + ..., + description="Spring 런타임 프리셋으로 검증된 2차 추출 결과", + ) + # ── 경험 병합 후보 검사 스키마 ────────────────────────────────────── class MergeExperiencePayload(BaseModel): diff --git a/myeongsung/app/services/experience_extraction_service.py b/myeongsung/app/services/experience_extraction_service.py index d634616..c963778 100644 --- a/myeongsung/app/services/experience_extraction_service.py +++ b/myeongsung/app/services/experience_extraction_service.py @@ -4,8 +4,17 @@ import fitz # PyMuPDF from langchain_openai import ChatOpenAI from langchain_core.prompts import ChatPromptTemplate -from typing import List -from app.schemas.resume_dto import ExperienceExtractionResponse, Step1ExtractionResponse, ExperienceSummary, Step2ExtractionResponse, Step2ExtractedExperience +from typing import Any, List, Optional +from app.schemas.resume_dto import ( + ExperienceExtractionResponse, + ExperiencePresetSchema, + ExperienceSummary, + Step1ExtractionResponse, + Step2ExtractionResponse, + Step2ExtractedExperience, + Step2V2ExtractionResponse, +) +from app.services.experience_preset_service import build_dynamic_step2_model def extract_step1_from_text(text: str) -> Step1ExtractionResponse: """ 텍스트에서 1차 경험 추출 (상세 증빙형 / 스펙 증빙형 분류 및 경험명 추출) @@ -173,14 +182,8 @@ def extract_experiences_from_pdf(file_content: bytes) -> ExperienceExtractionRes except Exception as e: raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") -def extract_step2_from_text(text: str, selected_experiences: List[ExperienceSummary]) -> Step2ExtractionResponse: - """ - 1차 추출에서 사용자가 선택한 경험 리스트를 바탕으로, 각 경험의 소분류에 맞는 맞춤형 필드를 원문에서 추출합니다. - (TPM 제한 방지를 위해 각 경험별로 개별 추출 후 병합합니다.) - """ - llm = ChatOpenAI(model="gpt-4o", temperature=0) - - prompt = ChatPromptTemplate.from_messages([ +def _step2_prompt() -> ChatPromptTemplate: + return ChatPromptTemplate.from_messages([ ("system", ( "당신은 사용자의 원문 텍스트에서 특정 경험의 상세 항목을 추출하는 전문가입니다.\n" "사용자가 제공하는 '선택된 경험'에 대하여 원문에서 해당하는 내용을 찾아 지정된 스키마에 맞게 상세 정보를 추출하세요.\n\n" @@ -204,6 +207,15 @@ def extract_step2_from_text(text: str, selected_experiences: List[ExperienceSumm )), ("user", "다음은 원문 텍스트입니다:\n\n\n{text}\n\n\n다음은 상세 내용을 추출해야 할 선택된 경험입니다:\n{selected_experience}") ]) + + +def extract_step2_from_text(text: str, selected_experiences: List[ExperienceSummary]) -> Step2ExtractionResponse: + """ + 1차 추출에서 사용자가 선택한 경험 리스트를 바탕으로, 각 경험의 소분류에 맞는 맞춤형 필드를 원문에서 추출합니다. + (TPM 제한 방지를 위해 각 경험별로 개별 추출 후 병합합니다.) + """ + llm = ChatOpenAI(model="gpt-4o", temperature=0) + prompt = _step2_prompt() chain = prompt | llm.with_structured_output(Step2ExtractedExperience) @@ -226,6 +238,53 @@ def extract_step2_from_text(text: str, selected_experiences: List[ExperienceSumm return Step2ExtractionResponse(experiences=extracted_experiences) + +def extract_step2_v2_from_text( + text: str, + selected_experiences: List[ExperienceSummary], + preset_schemas: List[ExperiencePresetSchema], + llm: Optional[Any] = None, +) -> Step2V2ExtractionResponse: + preset_by_type = { + preset.experience_type_name: preset + for preset in preset_schemas + } + prompt = _step2_prompt() + extraction_llm = llm or ChatOpenAI(model="gpt-4o", temperature=0) + extracted_experiences = [] + + for experience in selected_experiences: + preset = preset_by_type.get(experience.experience_type) + if preset is None: + raise ValueError( + f"'{experience.experience_type}' 경험의 프리셋 스키마가 없습니다." + ) + output_model = build_dynamic_step2_model(experience, preset) + chain = prompt | extraction_llm.with_structured_output(output_model) + try: + result = chain.invoke( + { + "text": text, + "selected_experience": experience.model_dump(), + }, + config={ + "run_name": f"experience-step2-v2-extraction-{experience.experience_name}", + "tags": ["experience-extraction", "step2-v2"], + }, + ) + except Exception as e: + raise ValueError( + f"'{experience.experience_name}' 2차 V2 경험 추출 중 오류가 발생했습니다: {str(e)}" + ) + + if result.experience_type != experience.experience_type: + raise ValueError("AI 응답의 경험 유형이 선택 경험과 일치하지 않습니다.") + if result.experience_group != experience.experience_group: + raise ValueError("AI 응답의 경험 그룹이 선택 경험과 일치하지 않습니다.") + extracted_experiences.append(result.model_dump()) + + return Step2V2ExtractionResponse(experiences=extracted_experiences) + def extract_step2_from_url(url: str, selected_experiences: List[ExperienceSummary]) -> Step2ExtractionResponse: try: response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) @@ -266,3 +325,58 @@ def extract_step2_from_pdf(file_content: bytes, selected_experiences: List[Exper except Exception as e: raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") + +def extract_step2_v2_from_url( + url: str, + selected_experiences: List[ExperienceSummary], + preset_schemas: List[ExperiencePresetSchema], +) -> Step2V2ExtractionResponse: + try: + response = requests.get( + url, + headers={"User-Agent": "Mozilla/5.0"}, + timeout=10, + ) + response.raise_for_status() + + content_type = response.headers.get("Content-Type", "").lower() + if "application/pdf" in content_type or url.lower().split("?")[0].endswith(".pdf"): + doc = fitz.open(stream=response.content, filetype="pdf") + full_text = "\n".join(page.get_text() for page in doc) + else: + soup = BeautifulSoup(response.text, "html.parser") + for script in soup(["script", "style"]): + script.decompose() + raw_text = soup.get_text(separator="\n") + lines = (line.strip() for line in raw_text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + full_text = "\n".join(chunk for chunk in chunks if chunk) + + if not full_text.strip(): + raise ValueError("URL에서 유의미한 텍스트를 추출하지 못했습니다.") + return extract_step2_v2_from_text( + full_text, + selected_experiences, + preset_schemas, + ) + except Exception as e: + raise ValueError(f"URL 분석 중 오류가 발생했습니다: {str(e)}") + + +def extract_step2_v2_from_pdf( + file_content: bytes, + selected_experiences: List[ExperienceSummary], + preset_schemas: List[ExperiencePresetSchema], +) -> Step2V2ExtractionResponse: + try: + doc = fitz.open(stream=file_content, filetype="pdf") + full_text = "\n".join(page.get_text() for page in doc) + if not full_text.strip(): + raise ValueError("PDF에서 유의미한 텍스트를 추출하지 못했습니다.") + return extract_step2_v2_from_text( + full_text, + selected_experiences, + preset_schemas, + ) + except Exception as e: + raise ValueError(f"PDF 분석 중 오류가 발생했습니다: {str(e)}") diff --git a/myeongsung/app/services/experience_merge_service.py b/myeongsung/app/services/experience_merge_service.py index f449288..65c0b83 100644 --- a/myeongsung/app/services/experience_merge_service.py +++ b/myeongsung/app/services/experience_merge_service.py @@ -406,3 +406,32 @@ def apply_merge_results_to_step2( experience.merge_similarity = result.similarity return step2_experiences + + +def apply_sequential_merge_results_to_step2( + step2_experiences: List[dict], + existing_experiences: List[MergeExperiencePayload], + threshold: Optional[float] = None, + embedding_client: Optional[Any] = None, +) -> List[dict]: + accepted_candidates: List[Any] = list(existing_experiences) + + for index, experience in enumerate(step2_experiences): + merge_response = check_merge_candidates( + targets=[experience], + existing_experiences=accepted_candidates, + threshold=threshold, + top_k=1, + embedding_client=embedding_client, + ) + result = merge_response.results[0] + experience["needs_merge"] = result.needs_merge + experience["merge_candidate_id"] = result.merge_candidate_id + experience["merge_similarity"] = result.similarity + + if not result.needs_merge: + accepted_candidate = dict(experience) + accepted_candidate["id"] = f"batch:{index}" + accepted_candidates.append(accepted_candidate) + + return step2_experiences diff --git a/myeongsung/app/services/experience_preset_service.py b/myeongsung/app/services/experience_preset_service.py new file mode 100644 index 0000000..b72e1d1 --- /dev/null +++ b/myeongsung/app/services/experience_preset_service.py @@ -0,0 +1,50 @@ +import re +from typing import Optional + +from pydantic import ConfigDict, Field, create_model + +from app.schemas.resume_dto import ( + ExperiencePresetSchema, + ExperienceSummary, + Step2ExtractedExperience, +) + + +def build_dynamic_step2_model( + selected_experience: ExperienceSummary, + preset_schema: ExperiencePresetSchema, +): + if preset_schema.experience_type_name != selected_experience.experience_type: + raise ValueError("선택 경험과 프리셋의 경험 유형이 일치하지 않습니다.") + if preset_schema.experience_group != selected_experience.experience_group: + raise ValueError("선택 경험과 프리셋의 경험 그룹이 일치하지 않습니다.") + + basic_info_fields = { + field.key: ( + Optional[str], + Field(default=None, description=field.label), + ) + for field in preset_schema.fields + } + safe_type_name = re.sub(r"[^0-9A-Za-z_]", "_", preset_schema.experience_type) + basic_info_model = create_model( + f"{safe_type_name}RuntimeBasicInfo", + __config__=ConfigDict(extra="forbid"), + **basic_info_fields, + ) + + experience_fields = {} + for name, field_info in Step2ExtractedExperience.model_fields.items(): + if name == "basic_info": + experience_fields[name] = ( + basic_info_model, + Field(..., description="Spring PresetRegistry 기반 유형별 기본 필드"), + ) + else: + experience_fields[name] = (field_info.annotation, field_info) + + return create_model( + f"{safe_type_name}RuntimeStep2Experience", + __config__=ConfigDict(extra="forbid"), + **experience_fields, + ) diff --git a/myeongsung/docs/ai-harness/user-flows.md b/myeongsung/docs/ai-harness/user-flows.md index 1f75b4d..369e28f 100644 --- a/myeongsung/docs/ai-harness/user-flows.md +++ b/myeongsung/docs/ai-harness/user-flows.md @@ -56,8 +56,8 @@ Open decisions: Confirm any new experience classification types before adding them. ## Flow: Experience Extraction Step2 -Date: 2026-06-02 -Status: draft +Date: 2026-06-20 +Status: changed User action: User selects step1 experience candidates for detailed extraction. @@ -66,22 +66,28 @@ Spring API: `POST /api/experiences/extract/step2`. FastAPI API: -`POST /api/v1/extract-experiences/step2`. +`POST /api/v1/extract-experiences/step2` or +`POST /api/v1/extract-experiences/step2-v2`. Input source: -Original file, URL, or text plus `selected_experiences`. +Original file, URL, or text plus `selected_experiences`. V2 also requires +Spring `PresetRegistry` schemas and accepts existing experiences. Service flow: FastAPI extracts detailed fields for each selected experience and applies merge candidate detection when existing experiences are provided. +V2 builds each `basic_info` output model from the runtime preset, rejects undeclared fields, +and checks each result against existing experiences plus earlier accepted results in selection order. External APIs: May use LLM, embeddings, URL parsing, and document parsing services. Response: Detailed experiences including `basic_info`, keywords, content, and merge metadata. +Batch-local merge candidates use IDs in the form `batch:{selected_index}` for Spring to resolve. Failure cases: -Missing source, invalid `selected_experiences` JSON, invalid existing experience payload, external API failure, timeout. +Missing source, invalid request JSON, missing or mismatched preset schema, undeclared `basic_info` +field, external API failure, or timeout. Spring compatibility: Response must remain compatible with Spring `AiStep2Response`. diff --git a/myeongsung/tests/test_experience_merge_service.py b/myeongsung/tests/test_experience_merge_service.py index 8d7b0c5..ff4321c 100644 --- a/myeongsung/tests/test_experience_merge_service.py +++ b/myeongsung/tests/test_experience_merge_service.py @@ -2,7 +2,11 @@ import unittest from app.schemas.resume_dto import MergeExperiencePayload, Step2ExtractedExperience -from app.services.experience_merge_service import build_embedding_text, check_merge_candidates +from app.services.experience_merge_service import ( + apply_sequential_merge_results_to_step2, + build_embedding_text, + check_merge_candidates, +) class FakeEmbeddings: @@ -34,6 +38,20 @@ class LowSimilarityOpenAI: embeddings = LowSimilarityEmbeddings() +class SameEmbeddings: + def create(self, model, input): + return SimpleNamespace( + data=[ + SimpleNamespace(embedding=[1.0, 0.0]) + for _ in input + ] + ) + + +class SameOpenAI: + embeddings = SameEmbeddings() + + class ExperienceMergeServiceTest(unittest.TestCase): def test_check_merge_candidates_marks_similar_target(self): @@ -185,6 +203,35 @@ def test_low_embedding_with_two_key_field_matches_marks_merge(self): self.assertTrue(response.results[0].needs_merge) self.assertEqual("exp-1", response.results[0].merge_candidate_id) + def test_sequential_merge_uses_first_non_duplicate_as_batch_candidate(self): + experiences = [ + { + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + "basic_info": {"project_name": "캡스톤 프로젝트"}, + "experience_content": "추천 모델을 개발했습니다.", + }, + { + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + "basic_info": {"project_name": "캡스톤 프로젝트"}, + "experience_content": "추천 모델을 개발했습니다.", + }, + ] + + result = apply_sequential_merge_results_to_step2( + experiences, + [], + threshold=0.86, + embedding_client=SameOpenAI(), + ) + + self.assertFalse(result[0]["needs_merge"]) + self.assertTrue(result[1]["needs_merge"]) + self.assertEqual("batch:0", result[1]["merge_candidate_id"]) + if __name__ == "__main__": unittest.main() diff --git a/myeongsung/tests/test_experience_step2_v2.py b/myeongsung/tests/test_experience_step2_v2.py new file mode 100644 index 0000000..44de036 --- /dev/null +++ b/myeongsung/tests/test_experience_step2_v2.py @@ -0,0 +1,68 @@ +import unittest + +from pydantic import ValidationError + +from app.schemas.resume_dto import ExperiencePresetSchema, ExperienceSummary +from app.services.experience_preset_service import build_dynamic_step2_model + + +class ExperienceStep2V2Test(unittest.TestCase): + + def test_dynamic_model_allows_only_runtime_preset_fields(self): + selected = ExperienceSummary( + experience_name="캡스톤 프로젝트", + experience_group="상세 서술형", + experience_type="프로젝트", + ) + preset = ExperiencePresetSchema( + experience_group="상세 서술형", + experience_type="PROJECT", + experience_type_name="프로젝트", + fields=[ + {"key": "project_name", "label": "프로젝트명"}, + {"key": "period", "label": "진행 기간"}, + ], + ) + model = build_dynamic_step2_model(selected, preset) + + parsed = model.model_validate({ + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + "basic_info": { + "project_name": "캡스톤 프로젝트", + "period": "2026.01 ~ 2026.06", + }, + }) + + self.assertEqual("캡스톤 프로젝트", parsed.basic_info.project_name) + with self.assertRaises(ValidationError): + model.model_validate({ + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + "basic_info": { + "project_name": "캡스톤 프로젝트", + "unknown_field": "허용되지 않음", + }, + }) + + def test_dynamic_model_rejects_mismatched_group(self): + selected = ExperienceSummary( + experience_name="토익", + experience_group="스펙·증빙형", + experience_type="어학", + ) + preset = ExperiencePresetSchema( + experience_group="상세 서술형", + experience_type="LANGUAGE", + experience_type_name="어학", + fields=[], + ) + + with self.assertRaises(ValueError): + build_dynamic_step2_model(selected, preset) + + +if __name__ == "__main__": + unittest.main() diff --git a/myeongsung/tests/test_experience_step2_v2_api.py b/myeongsung/tests/test_experience_step2_v2_api.py new file mode 100644 index 0000000..71109a8 --- /dev/null +++ b/myeongsung/tests/test_experience_step2_v2_api.py @@ -0,0 +1,181 @@ +import json +import subprocess +import sys +import unittest +from unittest.mock import patch + + +def _fastapi_runtime_available() -> bool: + try: + completed = subprocess.run( + [ + sys.executable, + "-c", + "import unicodedata, fastapi, httpx", + ], + check=False, + capture_output=True, + timeout=10, + ) + return completed.returncode == 0 + except subprocess.TimeoutExpired: + return False + + +@unittest.skipUnless( + _fastapi_runtime_available(), + "현재 Python 런타임에서 FastAPI 네이티브 의존성을 로드할 수 없습니다.", +) +class ExperienceStep2V2ApiTest(unittest.TestCase): + + def setUp(self): + from fastapi import FastAPI + from fastapi.testclient import TestClient + from app.api.experience_extraction_v2 import router + + app = FastAPI() + app.include_router(router, prefix="/api/v1") + self.client = TestClient(app) + + def test_step2_v2_parses_spring_multipart_contract_and_returns_merge_shape(self): + from app.schemas.resume_dto import Step2V2ExtractionResponse + + captured = {} + + def fake_extract(url, selected, presets): + captured["url"] = url + captured["selected"] = selected + captured["presets"] = presets + return Step2V2ExtractionResponse(experiences=[{ + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + "keywords": ["실행력"], + "basic_info": { + "project_name": "캡스톤 프로젝트", + "period": "2026.01 ~ 2026.06", + }, + "experience_content": "추천 모델을 개발했습니다.", + }]) + + def fake_merge(experiences, existing): + captured["existing"] = existing + experiences[0]["needs_merge"] = True + experiences[0]["merge_candidate_id"] = "existing-1" + experiences[0]["merge_similarity"] = 0.92 + return experiences + + with patch( + "app.api.experience_extraction_v2.extract_step2_v2_from_url", + side_effect=fake_extract, + ), patch( + "app.api.experience_extraction_v2.apply_sequential_merge_results_to_step2", + side_effect=fake_merge, + ): + response = self.client.post( + "/api/v1/extract-experiences/step2-v2", + data={ + "url": "https://cdn.example.com/resume.pdf", + "selected_experiences": json.dumps([{ + "experience_name": "캡스톤 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + }], ensure_ascii=False), + "existing_experiences": json.dumps([{ + "id": "existing-1", + "title": "기존 프로젝트", + "experience_group": "상세 서술형", + "experience_type": "프로젝트", + }], ensure_ascii=False), + "preset_schemas": json.dumps([{ + "experience_group": "상세 서술형", + "experience_type": "PROJECT", + "experience_type_name": "프로젝트", + "fields": [ + {"key": "project_name", "label": "프로젝트명"}, + {"key": "period", "label": "진행 기간"}, + ], + }], ensure_ascii=False), + }, + ) + + self.assertEqual(200, response.status_code) + self.assertEqual( + "https://cdn.example.com/resume.pdf", + captured["url"], + ) + self.assertEqual( + "캡스톤 프로젝트", + captured["selected"][0].experience_name, + ) + self.assertEqual( + "PROJECT", + captured["presets"][0].experience_type, + ) + self.assertEqual("existing-1", captured["existing"][0].id) + body = response.json()["experiences"][0] + self.assertTrue(body["needs_merge"]) + self.assertEqual("existing-1", body["merge_candidate_id"]) + self.assertEqual(0.92, body["merge_similarity"]) + self.assertEqual( + "캡스톤 프로젝트", + body["basic_info"]["project_name"], + ) + + def test_step2_v2_rejects_invalid_preset_json(self): + response = self.client.post( + "/api/v1/extract-experiences/step2-v2", + data={ + "url": "https://cdn.example.com/resume.pdf", + "selected_experiences": "[]", + "existing_experiences": "[]", + "preset_schemas": "{invalid-json", + }, + ) + + self.assertEqual(400, response.status_code) + self.assertIn( + "Step2 V2 요청 JSON 파싱 오류", + response.json()["detail"], + ) + + def test_step2_v2_returns_400_when_selected_experience_and_preset_mismatch(self): + from app.services.experience_preset_service import ( + build_dynamic_step2_model, + ) + + def mismatched_extract(url, selected, presets): + build_dynamic_step2_model(selected[0], presets[0]) + + with patch( + "app.api.experience_extraction_v2.extract_step2_v2_from_url", + side_effect=mismatched_extract, + ): + response = self.client.post( + "/api/v1/extract-experiences/step2-v2", + data={ + "url": "https://cdn.example.com/resume.pdf", + "selected_experiences": json.dumps([{ + "experience_name": "토익", + "experience_group": "스펙·증빙형", + "experience_type": "어학", + }], ensure_ascii=False), + "existing_experiences": "[]", + "preset_schemas": json.dumps([{ + "experience_group": "상세 서술형", + "experience_type": "LANGUAGE", + "experience_type_name": "어학", + "fields": [], + }], ensure_ascii=False), + }, + ) + + self.assertEqual(400, response.status_code) + self.assertEqual( + "선택 경험과 프리셋의 경험 그룹이 일치하지 않습니다.", + response.json()["detail"], + ) + + +if __name__ == "__main__": + unittest.main()