diff --git a/README.md b/README.md index 090f6d4..361d92b 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ Voila un sample des données parsées [dvf/sample.json](dvf/sample.json) https://files.data.gouv.fr/geo-dvf/latest/csv/ mkdir -p geodvf + curl https://files.data.gouv.fr/geo-dvf/latest/csv/2025/full.csv.gz -o geodvf/2025.csv.gz + curl https://files.data.gouv.fr/geo-dvf/latest/csv/2024/full.csv.gz -o geodvf/2024.csv.gz curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz @@ -30,13 +32,19 @@ https://files.data.gouv.fr/geo-dvf/latest/csv/ Voila un sample des données parsées [geodvf/sample.json](geodvf/sample.json) -# run +# sync (automatique) -Cette commande ouvre les fichiers csv et chie des inserts mysql en batch +Détecte les années manquantes en base, télécharge les csv.gz si besoin, parse et insère directement. - echo "MYSQL=mysql://user:password@host/database?charset=utf8mb4&connectionLimit=10" > .env - node parse.js geodvf/2022.csv.gz | gzip > geodvf/2022.sql.gz - pv geodvf/2023.sql.gz | gunzip | mysql -u user -ppassword -h host database + node sync.js # importe les années manquantes + node sync.js 2025 # force le re-import d'une année + +# parse (manuel) + +Génère du SQL sur stdout à partir d'un csv.gz (ancien workflow). 
+ + node parse.js geodvf/2025.csv.gz | gzip > geodvf/2025.sql.gz + pv geodvf/2025.sql.gz | gunzip | mysql -u user -ppassword -h host database ``` CREATE TABLE IF NOT EXISTS dvf ( diff --git a/lib/dotenv.js b/lib/dotenv.js new file mode 100644 index 0000000..d60daa0 --- /dev/null +++ b/lib/dotenv.js @@ -0,0 +1,6 @@ +import { findUpSync } from 'find-up'; +import dotenv from 'dotenv'; +import path from 'node:path'; + +const foundenv = findUpSync('.env', { cwd: path.dirname(process.argv[1]) }); +dotenv.config({ path: foundenv, quiet: true }); diff --git a/package-lock.json b/package-lock.json index ac54d0f..b9023c8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "csv-parse": "^6.2.1", "dotenv": "^17.3.1", + "find-up": "^8.0.0", "mysql2": "^3.20.0" }, "devDependencies": { @@ -2057,6 +2058,97 @@ "node": ">=4.0" } }, + "node_modules/eslint/node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "dev": true, + "license": "MIT", + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/eslint/node_modules/locate-path": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-locate": "^5.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/eslint/node_modules/minimatch": { + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz", + "integrity": 
"sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/eslint/node_modules/p-limit": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "yocto-queue": "^0.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/eslint/node_modules/p-locate": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-limit": "^3.0.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/eslint/node_modules/yocto-queue": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/espree": { "version": "10.4.0", "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", @@ -2196,17 +2288,16 @@ } }, "node_modules/find-up": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", - "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", - "dev": true, + "version": "8.0.0", + "resolved": 
"https://registry.npmjs.org/find-up/-/find-up-8.0.0.tgz", + "integrity": "sha512-JGG8pvDi2C+JxidYdIwQDyS/CgcrIdh18cvgxcBge3wSHRQOrooMD3GlFBcmMJAN9M42SAZjDp5zv1dglJjwww==", "license": "MIT", "dependencies": { - "locate-path": "^6.0.0", - "path-exists": "^4.0.0" + "locate-path": "^8.0.0", + "unicorn-magic": "^0.3.0" }, "engines": { - "node": ">=10" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -3274,16 +3365,15 @@ } }, "node_modules/locate-path": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", - "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", - "dev": true, + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-8.0.0.tgz", + "integrity": "sha512-XT9ewWAC43tiAV7xDAPflMkG0qOPn2QjHqlgX8FOqmWa/rxnyYDulF9T0F7tRy1u+TVTmK/M//6VIOye+2zDXg==", "license": "MIT", "dependencies": { - "p-locate": "^5.0.0" + "p-locate": "^6.0.0" }, "engines": { - "node": ">=10" + "node": ">=20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -3628,32 +3718,30 @@ } }, "node_modules/p-limit": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", - "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", - "dev": true, + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-4.0.0.tgz", + "integrity": "sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ==", "license": "MIT", "dependencies": { - "yocto-queue": "^0.1.0" + "yocto-queue": "^1.0.0" }, "engines": { - "node": ">=10" + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/p-locate": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", - "integrity": 
"sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", - "dev": true, + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-6.0.0.tgz", + "integrity": "sha512-wPrq66Llhl7/4AGC6I+cqxT07LhXvWL08LNXz1fENOw0Ap4sRZZ/gZpTTJ5jpurzzzfS2W/Ge9BY3LgLjCShcw==", "license": "MIT", "dependencies": { - "p-limit": "^3.0.2" + "p-limit": "^4.0.0" }, "engines": { - "node": ">=10" + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" @@ -4621,6 +4709,18 @@ "license": "MIT", "peer": true }, + "node_modules/unicorn-magic": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/unicorn-magic/-/unicorn-magic-0.3.0.tgz", + "integrity": "sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/update-browserslist-db": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", @@ -4850,13 +4950,12 @@ } }, "node_modules/yocto-queue": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", - "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", - "dev": true, + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.2.2.tgz", + "integrity": "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==", "license": "MIT", "engines": { - "node": ">=10" + "node": ">=12.20" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" diff --git a/package.json b/package.json index 74cd4b7..77dbd80 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "dependencies": { "csv-parse": "^6.2.1", "dotenv": "^17.3.1", + "find-up": 
"^8.0.0", "mysql2": "^3.20.0" }, "devDependencies": {
diff --git a/sync.js b/sync.js
new file mode 100644
index 0000000..a5546cd
--- /dev/null
+++ b/sync.js
@@ -0,0 +1,235 @@
+import { parse } from 'csv-parse';
+import fs from 'node:fs';
+import zlib from 'node:zlib';
+import { pipeline } from 'node:stream/promises';
+import { Writable } from 'node:stream';
+import mysql from 'mysql2/promise';
+import './lib/dotenv.js';
+
+const BASE_URL = 'https://files.data.gouv.fr/geo-dvf/latest/csv';
+const GEODVF_DIR = 'geodvf';
+// Rows sent per INSERT statement.
+const BATCH_SIZE = 5000;
+
+const connectionString = process.env.MYSQL;
+if (!connectionString) {
+  throw new Error('MYSQL environment variable not set');
+}
+
+// Parse the mysql:// connection string. URL leaves the credentials and path
+// percent-encoded, so each component is decoded before being handed to mysql2.
+const dbUrl = new URL(connectionString);
+const dbConfig = {
+  host: dbUrl.hostname,
+  port: dbUrl.port ? Number(dbUrl.port) : 3306,
+  user: decodeURIComponent(dbUrl.username),
+  password: decodeURIComponent(dbUrl.password),
+  database: decodeURIComponent(dbUrl.pathname.slice(1)),
+  charset: 'utf8mb4',
+};
+
+/**
+ * Lists the years published under BASE_URL by scraping the directory index
+ * for links of the form `href="YYYY/"`.
+ *
+ * @returns {Promise<number[]>} Distinct years in ascending order.
+ * @throws {Error} If the index request does not return a 2xx status.
+ */
+async function getYearsOnServer() {
+  const indexUrl = `${BASE_URL}/`;
+  const res = await fetch(indexUrl);
+  if (!res.ok) throw new Error(`HTTP ${res.status} pour ${indexUrl}`);
+  const html = await res.text();
+  const years = new Set();
+  for (const match of html.matchAll(/href="(\d{4})\/"/g)) {
+    years.add(Number(match[1]));
+  }
+  return [...years].sort((a, b) => a - b);
+}
+
+/**
+ * Lists the years that already have at least one row in the `dvf` table.
+ *
+ * @param {import('mysql2/promise').Connection} connection Open connection.
+ * @returns {Promise<number[]>} Years in ascending order.
+ */
+async function getYearsInDb(connection) {
+  // Rows without a date would otherwise surface as a NULL "year".
+  const [rows] = await connection.query(
+    'SELECT DISTINCT YEAR(date_mutation) as annee FROM dvf WHERE date_mutation IS NOT NULL ORDER BY annee',
+  );
+  return rows.map((r) => Number(r.annee));
+}
+
+/**
+ * Ensures `geodvf/<year>.csv.gz` exists locally, downloading it if needed.
+ *
+ * @param {number} year Year to fetch.
+ * @returns {Promise<string>} Path of the local archive.
+ * @throws {Error} If the server answers with a non-2xx status or the transfer fails.
+ */
+async function downloadYear(year) {
+  const file = `${GEODVF_DIR}/${year}.csv.gz`;
+  if (fs.existsSync(file)) {
+    console.log(` ${file} déjà présent, skip download`);
+    return file;
+  }
+  const url = `${BASE_URL}/${year}/full.csv.gz`;
+  console.log(` Téléchargement ${url} ...`);
+  const res = await fetch(url);
+  if (!res.ok) throw new Error(`HTTP ${res.status} pour ${url}`);
+  fs.mkdirSync(GEODVF_DIR, { recursive: true });
+  // Download to a side file and rename once complete: an interrupted transfer
+  // must not leave a truncated archive that the existsSync() check above
+  // would accept on the next run.
+  const partial = `${file}.part`;
+  try {
+    await pipeline(res.body, fs.createWriteStream(partial));
+    fs.renameSync(partial, file);
+  } catch (err) {
+    fs.rmSync(partial, { force: true });
+    throw err;
+  }
+  const size = fs.statSync(file).size;
+  console.log(` Téléchargé ${file} (${(size / 1024 / 1024).toFixed(1)} Mo)`);
+  return file;
+}
+
+/**
+ * Replaces the rows of one year with the content of a gzipped CSV export.
+ *
+ * Existing rows dated within `year` are deleted first so that a forced
+ * re-import does not duplicate them, and the whole operation runs in one
+ * transaction so a failure does not leave a partially imported year.
+ * NOTE(review): the rollback only helps if `dvf` uses a transactional
+ * engine, and this assumes every row of the file is dated within `year` — confirm.
+ *
+ * @param {import('mysql2/promise').Connection} connection Open connection.
+ * @param {number} year Year being imported.
+ * @param {string} file Path of the `.csv.gz` archive.
+ * @returns {Promise<number>} Number of rows inserted.
+ */
+async function importYear(connection, year, file) {
+  console.log(` Parsing et insertion de ${file} ...`);
+
+  let columns = null;
+  let inserted = 0;
+  let batch = [];
+
+  const flush = async () => {
+    if (batch.length === 0) return;
+    // A nested array bound to a single `?` is expanded by mysql2 into
+    // grouped value lists: (a, b), (c, d), ...
+    await connection.query(`INSERT INTO dvf (${columns.join(', ')}) VALUES ?`, [batch]);
+    inserted += batch.length;
+    if (inserted % 100_000 < BATCH_SIZE) {
+      process.stdout.write(` ... ${inserted.toLocaleString()} lignes insérées\r`);
+    }
+    batch = [];
+  };
+
+  const parser = parse({ delimiter: ',', columns: true });
+
+  const writer = new Writable({
+    objectMode: true,
+    async write(record, _encoding, callback) {
+      try {
+        // Column names come from the CSV header; quote them as identifiers.
+        columns ??= Object.keys(record).map((col) => `\`${col.replaceAll('`', '``')}\``);
+        // Empty CSV fields are stored as NULL.
+        batch.push(Object.values(record).map((v) => (v === '' ? null : v)));
+        if (batch.length >= BATCH_SIZE) {
+          await flush();
+        }
+        callback();
+      } catch (err) {
+        callback(err);
+      }
+    },
+    async final(callback) {
+      try {
+        await flush();
+        callback();
+      } catch (err) {
+        callback(err);
+      }
+    },
+  });
+
+  await connection.beginTransaction();
+  try {
+    await connection.query('DELETE FROM dvf WHERE date_mutation >= ? AND date_mutation < ?', [
+      `${year}-01-01`,
+      `${year + 1}-01-01`,
+    ]);
+    // Every stage is handed to pipeline() so that an error in any of them
+    // (including the file read and the gunzip) rejects and tears down the rest.
+    await pipeline(fs.createReadStream(file), zlib.createGunzip(), parser, writer);
+    await connection.commit();
+  } catch (err) {
+    try {
+      await connection.rollback();
+    } catch (rollbackErr) {
+      console.error(` Rollback impossible : ${rollbackErr.message}`);
+    }
+    throw err;
+  }
+
+  console.log(` ${inserted.toLocaleString()} lignes insérées pour ${year}`);
+  return inserted;
+}
+
+/**
+ * Entry point: imports every year present on the server but absent from the
+ * database, or only the year given as first CLI argument.
+ *
+ * @throws {Error} If the CLI argument is not a four-digit year.
+ */
+async function main() {
+  const arg = process.argv[2];
+  let forceYear = null;
+  if (arg !== undefined) {
+    // Number('abc') is NaN; reject it instead of silently falling back to a full sync.
+    if (!/^\d{4}$/.test(arg)) throw new Error(`Année invalide : ${arg}`);
+    forceYear = Number(arg);
+  }
+
+  console.log('Connexion à la base...');
+  const connection = await mysql.createConnection(dbConfig);
+  try {
+    console.log('Vérification des années sur le serveur...');
+    const serverYears = await getYearsOnServer();
+    console.log(`Années disponibles : ${serverYears.join(', ')}`);
+
+    const dbYears = await getYearsInDb(connection);
+    console.log(`Années en base : ${dbYears.join(', ')}`);
+
+    let missing;
+    if (forceYear !== null) {
+      missing = [forceYear];
+      console.log(`Mode forcé : import de ${forceYear}`);
+    } else {
+      missing = serverYears.filter((y) => !dbYears.includes(y));
+    }
+
+    if (missing.length === 0) {
+      console.log('Tout est à jour !');
+      return;
+    }
+
+    console.log(`Années manquantes : ${missing.join(', ')}`);
+
+    for (const year of missing) {
+      console.log(`\n=== ${year} ===`);
+      const file = await downloadYear(year);
+      await importYear(connection, year, file);
+    }
+
+    console.log('\nTerminé !');
+  } finally {
+    // Close the connection on every path, including failures.
+    await connection.end();
+  }
+}
+
+await main();