feature: sync.js - script automatique de détection, téléchargement et import des années manquantes geodvf

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-29 04:10:24 +02:00
parent e0fc56473c
commit 65de913c1d
5 changed files with 306 additions and 35 deletions

View File

@@ -22,6 +22,8 @@ Voila un sample des données parsées [dvf/sample.json](dvf/sample.json)
https://files.data.gouv.fr/geo-dvf/latest/csv/
mkdir -p geodvf
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2025/full.csv.gz -o geodvf/2025.csv.gz
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2024/full.csv.gz -o geodvf/2024.csv.gz
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz
@@ -30,13 +32,19 @@ https://files.data.gouv.fr/geo-dvf/latest/csv/
Voila un sample des données parsées [geodvf/sample.json](geodvf/sample.json)
# run
# sync (automatique)
Cette commande ouvre les fichiers csv et chie des inserts mysql en batch
Détecte les années manquantes en base, télécharge les csv.gz si besoin, parse et insère directement.
echo "MYSQL=mysql://user:password@host/database?charset=utf8mb4&connectionLimit=10" > .env
node parse.js geodvf/2022.csv.gz | gzip > geodvf/2022.sql.gz
pv geodvf/2023.sql.gz | gunzip | mysql -u user -ppassword -h host database
node sync.js # importe les années manquantes
node sync.js 2025 # force le re-import d'une année
# parse (manuel)
Génère du SQL sur stdout à partir d'un csv.gz (ancien workflow).
node parse.js geodvf/2025.csv.gz | gzip > geodvf/2025.sql.gz
pv geodvf/2025.sql.gz | gunzip | mysql -u user -ppassword -h host database
```
CREATE TABLE IF NOT EXISTS dvf (

6
lib/dotenv.js Normal file
View File

@@ -0,0 +1,6 @@
import { findUpSync } from 'find-up';
import dotenv from 'dotenv';
import path from 'node:path';
const foundenv = findUpSync('.env', { cwd: path.dirname(process.argv[1]) });
dotenv.config({ path: foundenv, quiet: true });

159
package-lock.json generated
View File

@@ -11,6 +11,7 @@
"dependencies": {
"csv-parse": "^6.2.1",
"dotenv": "^17.3.1",
"find-up": "^8.0.0",
"mysql2": "^3.20.0"
},
"devDependencies": {
@@ -2057,6 +2058,97 @@
"node": ">=4.0"
}
},
"node_modules/eslint/node_modules/find-up": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz",
"integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==",
"dev": true,
"license": "MIT",
"dependencies": {
"locate-path": "^6.0.0",
"path-exists": "^4.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eslint/node_modules/locate-path": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
"integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==",
"dev": true,
"license": "MIT",
"dependencies": {
"p-locate": "^5.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eslint/node_modules/minimatch": {
"version": "3.1.5",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz",
"integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==",
"dev": true,
"license": "ISC",
"dependencies": {
"brace-expansion": "^1.1.7"
},
"engines": {
"node": "*"
}
},
"node_modules/eslint/node_modules/p-limit": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
"integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"yocto-queue": "^0.1.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eslint/node_modules/p-locate": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz",
"integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==",
"dev": true,
"license": "MIT",
"dependencies": {
"p-limit": "^3.0.2"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/eslint/node_modules/yocto-queue": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
"integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==",
"dev": true,
"license": "MIT",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/espree": {
"version": "10.4.0",
"resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz",
@@ -2196,17 +2288,16 @@
}
},
"node_modules/find-up": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz",
"integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==",
"dev": true,
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/find-up/-/find-up-8.0.0.tgz",
"integrity": "sha512-JGG8pvDi2C+JxidYdIwQDyS/CgcrIdh18cvgxcBge3wSHRQOrooMD3GlFBcmMJAN9M42SAZjDp5zv1dglJjwww==",
"license": "MIT",
"dependencies": {
"locate-path": "^6.0.0",
"path-exists": "^4.0.0"
"locate-path": "^8.0.0",
"unicorn-magic": "^0.3.0"
},
"engines": {
"node": ">=10"
"node": ">=20"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
@@ -3274,16 +3365,15 @@
}
},
"node_modules/locate-path": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz",
"integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==",
"dev": true,
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-8.0.0.tgz",
"integrity": "sha512-XT9ewWAC43tiAV7xDAPflMkG0qOPn2QjHqlgX8FOqmWa/rxnyYDulF9T0F7tRy1u+TVTmK/M//6VIOye+2zDXg==",
"license": "MIT",
"dependencies": {
"p-locate": "^5.0.0"
"p-locate": "^6.0.0"
},
"engines": {
"node": ">=10"
"node": ">=20"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
@@ -3628,32 +3718,30 @@
}
},
"node_modules/p-limit": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
"integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==",
"dev": true,
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-4.0.0.tgz",
"integrity": "sha512-5b0R4txpzjPWVw/cXXUResoD4hb6U/x9BH08L7nw+GN1sezDzPdxeRvpc9c433fZhBan/wusjbCsqwqm4EIBIQ==",
"license": "MIT",
"dependencies": {
"yocto-queue": "^0.1.0"
"yocto-queue": "^1.0.0"
},
"engines": {
"node": ">=10"
"node": "^12.20.0 || ^14.13.1 || >=16.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/p-locate": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz",
"integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==",
"dev": true,
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/p-locate/-/p-locate-6.0.0.tgz",
"integrity": "sha512-wPrq66Llhl7/4AGC6I+cqxT07LhXvWL08LNXz1fENOw0Ap4sRZZ/gZpTTJ5jpurzzzfS2W/Ge9BY3LgLjCShcw==",
"license": "MIT",
"dependencies": {
"p-limit": "^3.0.2"
"p-limit": "^4.0.0"
},
"engines": {
"node": ">=10"
"node": "^12.20.0 || ^14.13.1 || >=16.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
@@ -4621,6 +4709,18 @@
"license": "MIT",
"peer": true
},
"node_modules/unicorn-magic": {
"version": "0.3.0",
"resolved": "https://registry.npmjs.org/unicorn-magic/-/unicorn-magic-0.3.0.tgz",
"integrity": "sha512-+QBBXBCvifc56fsbuxZQ6Sic3wqqc3WWaqxs58gvJrcOuN83HGTCwz3oS5phzU9LthRNE9VrJCFCLUgHeeFnfA==",
"license": "MIT",
"engines": {
"node": ">=18"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/update-browserslist-db": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz",
@@ -4850,13 +4950,12 @@
}
},
"node_modules/yocto-queue": {
"version": "0.1.0",
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz",
"integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==",
"dev": true,
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.2.2.tgz",
"integrity": "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ==",
"license": "MIT",
"engines": {
"node": ">=10"
"node": ">=12.20"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"

View File

@@ -23,6 +23,7 @@
"dependencies": {
"csv-parse": "^6.2.1",
"dotenv": "^17.3.1",
"find-up": "^8.0.0",
"mysql2": "^3.20.0"
},
"devDependencies": {

157
sync.js Normal file
View File

@@ -0,0 +1,157 @@
import { parse } from 'csv-parse';
import fs from 'node:fs';
import zlib from 'node:zlib';
import { pipeline } from 'node:stream/promises';
import { Writable } from 'node:stream';
import mysql from 'mysql2/promise';
import './lib/dotenv.js';
const BASE_URL = 'https://files.data.gouv.fr/geo-dvf/latest/csv';
const GEODVF_DIR = 'geodvf';
const connectionString = process.env.MYSQL;
if (!connectionString) {
throw new Error('MYSQL environment variable not set');
}
// parse mysql connection string
const url = new URL(connectionString);
const dbConfig = {
host: url.hostname,
port: url.port || 3306,
user: url.username,
password: decodeURIComponent(url.password),
database: url.pathname.slice(1),
charset: 'utf8mb4',
};
async function getYearsOnServer() {
const res = await fetch(`${BASE_URL}/`);
const html = await res.text();
const years = [];
for (const match of html.matchAll(/href="(\d{4})\/"/g)) {
years.push(Number(match[1]));
}
return years.toSorted();
}
async function getYearsInDb(connection) {
const [rows] = await connection.query('SELECT DISTINCT YEAR(date_mutation) as annee FROM dvf ORDER BY annee');
return rows.map(r => r.annee);
}
async function downloadYear(year) {
const file = `${GEODVF_DIR}/${year}.csv.gz`;
if (fs.existsSync(file)) {
console.log(` ${file} déjà présent, skip download`);
return file;
}
const url = `${BASE_URL}/${year}/full.csv.gz`;
console.log(` Téléchargement ${url} ...`);
const res = await fetch(url);
if (!res.ok) throw new Error(`HTTP ${res.status} pour ${url}`);
fs.mkdirSync(GEODVF_DIR, { recursive: true });
const dest = fs.createWriteStream(file);
await pipeline(res.body, dest);
const size = fs.statSync(file).size;
console.log(` Téléchargé ${file} (${(size / 1024 / 1024).toFixed(1)} Mo)`);
return file;
}
async function importYear(connection, year, file) {
console.log(` Parsing et insertion de ${file} ...`);
let columns = null;
let inserted = 0;
const BATCH_SIZE = 5000;
let batch = [];
const flush = async () => {
if (batch.length === 0) return;
const placeholders = batch.map(row => `(${row.map(() => '?').join(', ')})`).join(', ');
const sql = `INSERT INTO dvf (${columns.join(', ')}) VALUES ${placeholders}`;
const flat = batch.flat();
await connection.query(sql, flat);
inserted += batch.length;
if (inserted % 100_000 < BATCH_SIZE) {
process.stdout.write(` ... ${inserted.toLocaleString()} lignes insérées\r`);
}
batch = [];
};
const parser = parse({ delimiter: ',', columns: true });
const writer = new Writable({
objectMode: true,
async write(record, _encoding, callback) {
try {
if (!columns) {
columns = Object.keys(record).map(col => `\`${col}\``);
}
const values = Object.values(record).map(v => v === '' ? null : v);
batch.push(values);
if (batch.length >= BATCH_SIZE) {
await flush();
}
callback();
} catch (err) {
callback(err);
}
},
async final(callback) {
try {
await flush();
callback();
} catch (err) {
callback(err);
}
},
});
const input = fs.createReadStream(file).pipe(zlib.createGunzip());
await pipeline(input, parser, writer);
console.log(` ${inserted.toLocaleString()} lignes insérées pour ${year}`);
return inserted;
}
async function main() {
const forceYear = process.argv[2] ? Number(process.argv[2]) : null;
console.log('Connexion à la base...');
const connection = await mysql.createConnection(dbConfig);
console.log('Vérification des années sur le serveur...');
const serverYears = await getYearsOnServer();
console.log(`Années disponibles : ${serverYears.join(', ')}`);
const dbYears = await getYearsInDb(connection);
console.log(`Années en base : ${dbYears.join(', ')}`);
let missing;
if (forceYear) {
missing = [forceYear];
console.log(`Mode forcé : import de ${forceYear}`);
} else {
missing = serverYears.filter(y => !dbYears.includes(y));
}
if (missing.length === 0) {
console.log('Tout est à jour !');
await connection.end();
return;
}
console.log(`Années manquantes : ${missing.join(', ')}`);
for (const year of missing) {
console.log(`\n=== ${year} ===`);
const file = await downloadYear(year);
await importYear(connection, year, file);
}
console.log('\nTerminé !');
await connection.end();
}
await main();