up
This commit is contained in:
parent
dd5d52dd56
commit
96f8264b8e
6
.gitignore
vendored
6
.gitignore
vendored
@ -1,3 +1,5 @@
|
|||||||
*.csv
|
*dvf/*.csv
|
||||||
*.csv.gz
|
*dvf/*.sql
|
||||||
|
*dvf/*.gz
|
||||||
node_modules/
|
node_modules/
|
||||||
|
.env
|
||||||
|
63
README.md
63
README.md
@ -1,7 +1,14 @@
|
|||||||
# DVF
|
# DVF
|
||||||
|
|
||||||
|
Il y a deux sources de données.
|
||||||
|
dvf et geodvf
|
||||||
|
geodvf contient sensiblement les mêmes données mais avec la latitude et la longitude.
|
||||||
|
|
||||||
|
## DVF
|
||||||
|
|
||||||
https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
|
https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
|
||||||
|
|
||||||
|
mkdir -p dvf
|
||||||
curl -L https://www.data.gouv.fr/fr/datasets/r/78348f03-a11c-4a6b-b8db-2acf4fee81b1 -o dvf/2023.csv
|
curl -L https://www.data.gouv.fr/fr/datasets/r/78348f03-a11c-4a6b-b8db-2acf4fee81b1 -o dvf/2023.csv
|
||||||
curl -L https://www.data.gouv.fr/fr/datasets/r/87038926-fb31-4959-b2ae-7a24321c599a -o dvf/2022.csv
|
curl -L https://www.data.gouv.fr/fr/datasets/r/87038926-fb31-4959-b2ae-7a24321c599a -o dvf/2022.csv
|
||||||
curl -L https://www.data.gouv.fr/fr/datasets/r/817204ac-2202-4b4a-98e7-4184d154d98c -o dvf/2021.csv
|
curl -L https://www.data.gouv.fr/fr/datasets/r/817204ac-2202-4b4a-98e7-4184d154d98c -o dvf/2021.csv
|
||||||
@ -10,10 +17,11 @@ https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
|
|||||||
|
|
||||||
Voilà un échantillon des données parsées [dvf/sample.json](dvf/sample.json)
|
Voilà un échantillon des données parsées [dvf/sample.json](dvf/sample.json)
|
||||||
|
|
||||||
# GEODVF
|
## GEODVF
|
||||||
|
|
||||||
https://files.data.gouv.fr/geo-dvf/latest/csv/
|
https://files.data.gouv.fr/geo-dvf/latest/csv/
|
||||||
|
|
||||||
|
mkdir -p geodvf
|
||||||
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz
|
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz
|
||||||
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz
|
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz
|
||||||
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz
|
curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz
|
||||||
@ -24,8 +32,53 @@ Voila un sample des données parsées [geodvf/sample.json](geodvf/sample.json)
|
|||||||
|
|
||||||
# run
|
# run
|
||||||
|
|
||||||
Cette commande ouvre les fichiers csv et chie du json
|
Cette commande ouvre les fichiers CSV et génère des INSERT MySQL par lots
|
||||||
|
|
||||||
node parse.js dvf/2023.csv
|
echo "MYSQL=mysql://user:password@host/database?charset=utf8mb4&connectionLimit=10" > .env
|
||||||
node parse.js geodvf/2023.csv.gz
|
node parse.js geodvf/2023.csv.gz > geodvf/2023.sql
|
||||||
...
|
pv geodvf/2023.sql | mysql -u user -ppassword -h host database
|
||||||
|
|
||||||
|
```
|
||||||
|
CREATE TABLE IF NOT EXISTS dvf (
|
||||||
|
id_mutation VARCHAR(255),
|
||||||
|
date_mutation DATE,
|
||||||
|
numero_disposition VARCHAR(255),
|
||||||
|
nature_mutation VARCHAR(255),
|
||||||
|
valeur_fonciere DECIMAL(15, 2),
|
||||||
|
adresse_numero VARCHAR(255),
|
||||||
|
adresse_suffixe VARCHAR(255),
|
||||||
|
adresse_nom_voie VARCHAR(255),
|
||||||
|
adresse_code_voie VARCHAR(255),
|
||||||
|
code_postal VARCHAR(255),
|
||||||
|
code_commune VARCHAR(255),
|
||||||
|
nom_commune VARCHAR(255),
|
||||||
|
code_departement VARCHAR(255),
|
||||||
|
ancien_code_commune VARCHAR(255),
|
||||||
|
ancien_nom_commune VARCHAR(255),
|
||||||
|
id_parcelle VARCHAR(255),
|
||||||
|
ancien_id_parcelle VARCHAR(255),
|
||||||
|
numero_volume VARCHAR(255),
|
||||||
|
lot1_numero VARCHAR(255),
|
||||||
|
lot1_surface_carrez DECIMAL(15, 2),
|
||||||
|
lot2_numero VARCHAR(255),
|
||||||
|
lot2_surface_carrez DECIMAL(15, 2),
|
||||||
|
lot3_numero VARCHAR(255),
|
||||||
|
lot3_surface_carrez DECIMAL(15, 2),
|
||||||
|
lot4_numero VARCHAR(255),
|
||||||
|
lot4_surface_carrez DECIMAL(15, 2),
|
||||||
|
lot5_numero VARCHAR(255),
|
||||||
|
lot5_surface_carrez DECIMAL(15, 2),
|
||||||
|
nombre_lots INT,
|
||||||
|
code_type_local VARCHAR(255),
|
||||||
|
type_local VARCHAR(255),
|
||||||
|
surface_reelle_bati DECIMAL(15, 2),
|
||||||
|
nombre_pieces_principales INT,
|
||||||
|
code_nature_culture VARCHAR(255),
|
||||||
|
nature_culture VARCHAR(255),
|
||||||
|
code_nature_culture_speciale VARCHAR(255),
|
||||||
|
nature_culture_speciale VARCHAR(255),
|
||||||
|
surface_terrain DECIMAL(15, 2),
|
||||||
|
longitude DECIMAL(10, 6),
|
||||||
|
latitude DECIMAL(10, 6)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
13
package-lock.json
generated
13
package-lock.json
generated
@ -10,6 +10,7 @@
|
|||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"csv-parse": "^5.5.6",
|
"csv-parse": "^5.5.6",
|
||||||
|
"dotenv": "^16.4.5",
|
||||||
"moment": "^2.30.1",
|
"moment": "^2.30.1",
|
||||||
"mysql2": "^3.10.0"
|
"mysql2": "^3.10.0"
|
||||||
},
|
},
|
||||||
@ -32,6 +33,18 @@
|
|||||||
"node": ">=0.10"
|
"node": ">=0.10"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/dotenv": {
|
||||||
|
"version": "16.4.5",
|
||||||
|
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
|
||||||
|
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
|
||||||
|
"license": "BSD-2-Clause",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=12"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://dotenvx.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/generate-function": {
|
"node_modules/generate-function": {
|
||||||
"version": "2.3.1",
|
"version": "2.3.1",
|
||||||
"resolved": "https://registry.npmjs.org/generate-function/-/generate-function-2.3.1.tgz",
|
"resolved": "https://registry.npmjs.org/generate-function/-/generate-function-2.3.1.tgz",
|
||||||
|
@ -20,9 +20,10 @@
|
|||||||
"watch": "gulp watch"
|
"watch": "gulp watch"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"csv-parse": "^5.5.6",
|
||||||
|
"dotenv": "^16.4.5",
|
||||||
"moment": "^2.30.1",
|
"moment": "^2.30.1",
|
||||||
"mysql2": "^3.10.0",
|
"mysql2": "^3.10.0"
|
||||||
"csv-parse": "^5.5.6"
|
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=21.0.0"
|
"node": ">=21.0.0"
|
||||||
|
56
parse.js
56
parse.js
@ -1,38 +1,72 @@
|
|||||||
import { parse } from 'csv-parse';
|
import { parse } from 'csv-parse';
|
||||||
import fs from 'node:fs';
|
import fs from 'node:fs';
|
||||||
import zlib from 'zlib';
|
import zlib from 'zlib';
|
||||||
|
import mysql from 'mysql2';
|
||||||
|
import dotenv from 'dotenv';
|
||||||
|
|
||||||
|
dotenv.config();
|
||||||
|
|
||||||
if (process.argv.length != 3) {
|
if (process.argv.length != 3) {
|
||||||
throw new Error('You should give a project dir');
|
throw new Error('You should give a project dir');
|
||||||
}
|
}
|
||||||
const file = process.argv[2];
|
const file = process.argv[2];
|
||||||
const delimiter = (file.match(/\.gz$/)) ? ',' : '|';
|
|
||||||
|
|
||||||
|
// csv parser
|
||||||
|
const delimiter = (file.match(/\.gz$/)) ? ',' : '|';
|
||||||
const parser = parse({
|
const parser = parse({
|
||||||
delimiter,
|
delimiter,
|
||||||
columns: true,
|
columns: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
let lines = 0;
|
// mysql to escape string
|
||||||
|
const connectionString = process.env.MYSQL;
|
||||||
|
if (!connectionString) {
|
||||||
|
throw new Error('MYSQL environment variable not set');
|
||||||
|
}
|
||||||
|
|
||||||
|
const connection = mysql.createConnection(process.env.MYSQL);
|
||||||
|
|
||||||
|
// generate sql
|
||||||
|
let columns = [];
|
||||||
|
const flushBatch = (batch) => {
|
||||||
|
if (batch.length === 0) return;
|
||||||
|
|
||||||
|
const values = batch.map(row => `(${row.map(val => connection.escape(val)).join(', ')})`).join(', ');
|
||||||
|
const sql = `INSERT INTO dvf (${columns.join(', ')}) VALUES ${values};`;
|
||||||
|
|
||||||
|
console.log(sql);
|
||||||
|
|
||||||
|
// connection.query(sql, (error, results) => {
|
||||||
|
// if (error) throw error;
|
||||||
|
// console.log('Inserted rows:', results.affectedRows);
|
||||||
|
// });
|
||||||
|
};
|
||||||
|
|
||||||
|
// quand quelques lignes de csv sont parsées, on les assemble puis on génère le sql
|
||||||
parser.on('readable', function(){
|
parser.on('readable', function(){
|
||||||
let record;
|
let record;
|
||||||
|
const batch = [];
|
||||||
|
|
||||||
while ((record = parser.read()) !== null) {
|
while ((record = parser.read()) !== null) {
|
||||||
console.log(record);
|
// console.log(record);
|
||||||
lines++;
|
|
||||||
|
// get columns and values to insert and escape them for sql
|
||||||
|
if (columns.length === 0) {
|
||||||
|
columns = Object.keys(record).map(col => connection.escapeId(col));
|
||||||
}
|
}
|
||||||
|
const values = Object.values(record);
|
||||||
|
batch.push(values);
|
||||||
|
}
|
||||||
|
|
||||||
|
flushBatch(batch);
|
||||||
});
|
});
|
||||||
parser.on('error', function(err){
|
parser.on('error', function(err){
|
||||||
console.error(err.message);
|
console.error(err.message);
|
||||||
});
|
});
|
||||||
parser.on('end', function(){
|
parser.on('end', function(){
|
||||||
console.log('end', lines);
|
connection.end();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// ouvre le fichier, et le décompresse si besoin
|
||||||
if (file.match(/\.gz$/)) fs.createReadStream(file).pipe(zlib.createGunzip()).pipe(parser);
|
if (file.match(/\.gz$/)) fs.createReadStream(file).pipe(zlib.createGunzip()).pipe(parser);
|
||||||
else fs.createReadStream(file).pipe(parser);
|
else fs.createReadStream(file).pipe(parser);
|
||||||
|
|
||||||
// affiche la progression
|
|
||||||
const interval = setInterval(() => {
|
|
||||||
console.log(`found ${lines} lines`);
|
|
||||||
}, 1000);
|
|
||||||
interval.unref();
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user