diff --git a/components/Datasets/DatasetLongDescriptionSuggestFromFiles.vue b/components/Datasets/DatasetLongDescriptionSuggestFromFiles.vue new file mode 100644 index 000000000..9a2b94653 --- /dev/null +++ b/components/Datasets/DatasetLongDescriptionSuggestFromFiles.vue @@ -0,0 +1,138 @@ + + + diff --git a/components/Datasets/New/Step3AddResources.vue b/components/Datasets/New/Step3AddResources.vue index 76c96a712..85323aaea 100644 --- a/components/Datasets/New/Step3AddResources.vue +++ b/components/Datasets/New/Step3AddResources.vue @@ -137,6 +137,13 @@ + import { BrandedButton, PaddedContainer, SimpleBanner } from '@datagouv/components-next' +import DatasetLongDescriptionSuggestFromFiles from '../DatasetLongDescriptionSuggestFromFiles.vue' import UploadResourceModal from '../UploadResourceModal.vue' import type { DatasetForm, ResourceForm } from '~/types/types' +const datasetForm = defineModel('datasetForm', { required: true }) + const props = defineProps<{ loading: boolean resources: Array - datasetForm: DatasetForm }>() const emit = defineEmits<{ @@ -200,7 +209,7 @@ const { t } = useTranslation() const publishFileAccordionId = useId() const addDescriptionAccordionId = useId() -const isDatasetOpen = computed(() => props.datasetForm.access_type === 'open') +const isDatasetOpen = computed(() => datasetForm.value.access_type === 'open') const { form, getFirstError, getFirstWarning, touch, validate, errorsAsList: errors } = useForm({ resources: props.resources, diff --git a/components/Datasets/Structured/Step3DescribeDataset.vue b/components/Datasets/Structured/Step3DescribeDataset.vue index 0f7f4058d..c5c6b2c5c 100644 --- a/components/Datasets/Structured/Step3DescribeDataset.vue +++ b/components/Datasets/Structured/Step3DescribeDataset.vue @@ -107,6 +107,13 @@ :placeholder="$t('Décrivez le contenu, la source et l\'utilité de ces données...')" :rows="10" /> + +}>(), { + resources: () => [], +}) const emit = defineEmits<{ (e: 'previous' | 'next'): void diff --git 
a/pages/admin/datasets/new.vue b/pages/admin/datasets/new.vue
index bd7365ed2..61c895448 100644
--- a/pages/admin/datasets/new.vue
+++ b/pages/admin/datasets/new.vue
@@ -33,8 +33,8 @@
 />
diff --git a/server/routes/nuxt-api/albert/generate-dataset-long-description.post.ts b/server/routes/nuxt-api/albert/generate-dataset-long-description.post.ts
new file mode 100644
index 000000000..f97f64039
--- /dev/null
+++ b/server/routes/nuxt-api/albert/generate-dataset-long-description.post.ts
@@ -0,0 +1,116 @@
+import { DESCRIPTION_MIN_LENGTH } from '~/datagouv-components/src/functions/description'
+import { callAlbertAPI } from './utils/albert-helpers'
+import {
+  MAX_COMBINED_EXCERPT_CHARS,
+  MIN_COMBINED_EXCERPT_CHARS,
+} from '~/utils/read-dataset-file-excerpt'
+
+/** Upper bound, in characters, on the system + user prompt accepted before the model is called. */
+const MAXIMUM_PROMPT_LENGTH = 120_000
+
+/** Returns `value` trimmed when it is a string, and an empty string for any other type. */
+function trimmedString(value: unknown): string {
+  return typeof value === 'string' ? value.trim() : ''
+}
+
+/**
+ * Handles POST requests asking for a suggested long description of a dataset.
+ *
+ * Reads `title`, `organization` (optional) and `fileExcerpt` from the request
+ * body, builds a system and a user prompt around them, passes both to
+ * `callAlbertAPI` with the 'openweight-small' model name and returns the trimmed
+ * answer as `{ description }`.
+ *
+ * Throws a 400 error when the title is missing or the excerpt is shorter than
+ * MIN_COMBINED_EXCERPT_CHARS or longer than MAX_COMBINED_EXCERPT_CHARS, and a
+ * 422 error when the assembled prompt exceeds MAXIMUM_PROMPT_LENGTH or the
+ * answer is shorter than DESCRIPTION_MIN_LENGTH.
+ */
+export default defineEventHandler(async (event) => {
+  // The body is client-controlled: it may be missing, not an object, or carry
+  // non-string fields. Every field is normalised to a trimmed string up front so
+  // malformed input is answered with a 400 below instead of a TypeError
+  // (destructuring `undefined`, calling `.trim()` on a number) turning into a 500.
+  const body: unknown = await readBody(event)
+  const fields = typeof body === 'object' && body !== null ? body as Record<string, unknown> : {}
+  const title = trimmedString(fields.title)
+  const organization = trimmedString(fields.organization)
+  const excerpt = trimmedString(fields.fileExcerpt)
+
+  if (!title) {
+    throw createError({
+      statusCode: 400,
+      statusMessage: 'Title is required',
+    })
+  }
+
+  if (!excerpt || excerpt.length < MIN_COMBINED_EXCERPT_CHARS) {
+    throw createError({
+      statusCode: 400,
+      statusMessage: 'File excerpt is required and must contain enough text',
+    })
+  }
+
+  if (excerpt.length > MAX_COMBINED_EXCERPT_CHARS) {
+    throw createError({
+      statusCode: 400,
+      statusMessage: 'File excerpt exceeds maximum length',
+    })
+  }
+
+  const systemContent = `You are an assistant integrated into data.gouv.fr, the French open data platform.
+Your purpose is to help data producers write clear, comprehensive, and factual long descriptions of datasets.
+
+Guidelines:
+- Always respond in French.
+- Your tone is factual, neutral, and accessible to non-experts.
+- Use plain language and clear sentences; use Markdown when it helps structure (headings ##, bullet lists).
+- Do not make assumptions or add information that is not clearly supported by the excerpt or title.
+- Cover content, structure, and limits of the data when the excerpt allows it (variables, scope, updates, methodology if present).
+- Always start the main text with a capital letter.
+- IMPORTANT: Return ONLY the description text (Markdown allowed), without a preamble or labels such as "Description:".`
+
+  const userContent = `You are asked to generate a long description for a dataset on data.gouv.fr.
+
+Goal:
+→ Write a detailed, reusable description that helps people understand what the dataset contains and how to use it.
+→ Reflect only what can be inferred from the excerpt below (column names, codes, dates, geography, etc. when visible).
+→ If the excerpt is only a data sample, describe the likely subject matter and structure without inventing methodology or sources not shown.
+
+Dataset title: ${title}
+${organization ? `Producer organization: ${organization}\n` : ''}
+Excerpt from uploaded file(s):
+
+${excerpt}
+
+Output:
+→ A description in French with Markdown allowed (e.g. ## sections, lists).
+→ Minimum length: at least ${DESCRIPTION_MIN_LENGTH} characters.
+→ No generic filler about "open data" unless the excerpt supports it.`
+
+  // The excerpt is already capped above; this guard covers the whole prompt,
+  // whose size also depends on the title and organization supplied by the client.
+  const totalLength = systemContent.length + userContent.length
+  if (totalLength > MAXIMUM_PROMPT_LENGTH) {
+    throw createError({
+      statusCode: 422,
+      statusMessage: `The excerpt is too long to process (${totalLength} characters, maximum ${MAXIMUM_PROMPT_LENGTH}).`,
+    })
+  }
+
+  const messages = [
+    { role: 'system', content: systemContent },
+    { role: 'user', content: userContent },
+  ]
+
+  const generatedDescription = (await callAlbertAPI(messages, 'openweight-small')).trim()
+
+  if (generatedDescription.length < DESCRIPTION_MIN_LENGTH) {
+    throw createError({
+      statusCode: 422,
+      statusMessage: 'The model could not generate a sufficient description. Try a more descriptive text or CSV with headers.',
+    })
+  }
+
+  return { description: generatedDescription }
+})
diff --git a/utils/read-dataset-file-excerpt.ts b/utils/read-dataset-file-excerpt.ts
new file mode 100644
index 000000000..763041f1c
--- /dev/null
+++ b/utils/read-dataset-file-excerpt.ts
@@ -0,0 +1,141 @@
+import type { ResourceForm } from '~/types/types'
+
+/** Maximum number of bytes read from the start of each file. */
+const MAX_BYTES_PER_FILE = 800_000
+
+/** Maximum characters sent to the suggestion API (combined across files). */
+export const MAX_COMBINED_EXCERPT_CHARS = 100_000
+
+/** Minimum useful content length for the model input. */
+export const MIN_COMBINED_EXCERPT_CHARS = 50
+
+/** Joiner placed between the per-file pieces of the combined excerpt. */
+const PIECE_SEPARATOR = '\n'
+
+/** File extensions treated as text when the MIME type is not conclusive. */
+const TEXT_EXTENSIONS = new Set([
+  'csv',
+  'tsv',
+  'txt',
+  'md',
+  'json',
+  'jsonl',
+  'xml',
+  'yaml',
+  'yml',
+  'geojson',
+  'html',
+  'htm',
+])
+
+/** Returns the lower-cased text after the last dot of `name`, or '' when there is no dot. */
+function fileExtension(name: string): string {
+  const dot = name.lastIndexOf('.')
+  return dot >= 0 ? name.slice(dot + 1).toLowerCase() : ''
+}
+
+/** Tells whether `file` looks like text, judging by its MIME type first and its extension second. */
+function isProbablyTextFile(file: File): boolean {
+  const mime = file.type.toLowerCase()
+  if (mime.startsWith('text/')) {
+    return true
+  }
+  if (mime === 'application/json' || mime === 'application/xml' || mime === 'application/geo+json') {
+    return true
+  }
+  return TEXT_EXTENSIONS.has(fileExtension(file.name))
+}
+
+/**
+ * Shortens `text` to at most `maxLength` UTF-16 code units. When the cut would
+ * fall between the two halves of a surrogate pair, the orphaned first half is
+ * dropped as well so the result never ends on an unpaired surrogate.
+ */
+function truncateText(text: string, maxLength: number): string {
+  if (text.length <= maxLength) {
+    return text
+  }
+  const lastKept = text.charCodeAt(maxLength - 1)
+  const endsOnHighSurrogate = lastKept >= 0xD800 && lastKept <= 0xDBFF
+  return text.slice(0, endsOnHighSurrogate ? maxLength - 1 : maxLength)
+}
+
+/**
+ * Reads a bounded UTF-8 excerpt from a local file for dataset description suggestion.
+ *
+ * @returns The trimmed excerpt, at most MAX_COMBINED_EXCERPT_CHARS characters long, or
+ * `null` when the file does not look like text, is empty, contains a NUL character
+ * or yields fewer than MIN_COMBINED_EXCERPT_CHARS characters once trimmed.
+ */
+export async function readDatasetFileExcerptForDescription(file: File): Promise<string | null> {
+  if (!isProbablyTextFile(file)) {
+    return null
+  }
+  const byteLength = Math.min(MAX_BYTES_PER_FILE, file.size)
+  if (byteLength === 0) {
+    return null
+  }
+  const buffer = await file.slice(0, byteLength).arrayBuffer()
+  // When the file is larger than the byte budget, the cut may land inside a
+  // multi-byte UTF-8 sequence. In streaming mode the decoder holds back such an
+  // incomplete tail instead of emitting a U+FFFD replacement character for it.
+  const cutShort = byteLength < file.size
+  const text = new TextDecoder('utf-8', { fatal: false }).decode(buffer, { stream: cutShort })
+  // Presumably guards against binary content that passed the MIME/extension check.
+  if (text.includes('\0')) {
+    return null
+  }
+  const trimmed = text.trim()
+  if (trimmed.length < MIN_COMBINED_EXCERPT_CHARS) {
+    return null
+  }
+  return truncateText(trimmed, MAX_COMBINED_EXCERPT_CHARS)
+}
+
+/**
+ * Builds one prompt excerpt from readable local files in order.
+ *
+ * Every piece is a `--- label ---` header line followed by the file excerpt, and
+ * pieces are joined with PIECE_SEPARATOR. The result is at most
+ * MAX_COMBINED_EXCERPT_CHARS characters long, separators included.
+ *
+ * @returns The combined excerpt, or `null` when no resource yields usable text.
+ */
+export async function buildCombinedExcerptFromResourceForms(resources: Array<ResourceForm>): Promise<string | null> {
+  const chunks: string[] = []
+  let total = 0
+
+  // Files are read one after the other on purpose: the budget left for a file
+  // depends on how much the previous ones consumed.
+  for (const resource of resources) {
+    if (resource.filetype !== 'file' || !resource.file?.raw) {
+      continue
+    }
+    const excerpt = await readDatasetFileExcerptForDescription(resource.file.raw)
+    if (!excerpt) {
+      continue
+    }
+    const label = resource.title?.trim() || resource.file.raw.name
+    const header = `--- ${label} ---\n`
+    // The joiner inserted before every piece but the first counts against the
+    // budget too. Leaving it out lets the joined text overshoot the maximum,
+    // and excerpts longer than the maximum are rejected by the suggestion route.
+    const separatorLength = chunks.length > 0 ? PIECE_SEPARATOR.length : 0
+    const remaining = MAX_COMBINED_EXCERPT_CHARS - total - separatorLength
+    if (remaining <= header.length + MIN_COMBINED_EXCERPT_CHARS) {
+      break
+    }
+    const piece = `${header}${truncateText(excerpt, remaining - header.length)}`
+    chunks.push(piece)
+    total += separatorLength + piece.length
+    if (total >= MAX_COMBINED_EXCERPT_CHARS) {
+      break
+    }
+  }
+
+  if (!chunks.length) {
+    return null
+  }
+  const combined = chunks.join(PIECE_SEPARATOR)
+  return combined.length >= MIN_COMBINED_EXCERPT_CHARS ? combined : null
+}