up
This commit is contained in:
		
							
								
								
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -1,3 +1,5 @@
 | 
				
			|||||||
*.csv
 | 
					*dvf/*.csv
 | 
				
			||||||
*.csv.gz
 | 
					*dvf/*.sql
 | 
				
			||||||
 | 
					*dvf/*.gz
 | 
				
			||||||
node_modules/
 | 
					node_modules/
 | 
				
			||||||
 | 
					.env
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										63
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										63
									
								
								README.md
									
									
									
									
									
								
							@@ -1,7 +1,14 @@
 | 
				
			|||||||
# DVF
 | 
					# DVF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Il y a deux sources de données.
 | 
				
			||||||
 | 
					dvf et geodvf
 | 
				
			||||||
 | 
					geodvf contient sensiblement les mêmes données mais avec la latitude et la longitude.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## DVF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
 | 
					https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    mkdir -p dvf
 | 
				
			||||||
    curl -L https://www.data.gouv.fr/fr/datasets/r/78348f03-a11c-4a6b-b8db-2acf4fee81b1 -o dvf/2023.csv
 | 
					    curl -L https://www.data.gouv.fr/fr/datasets/r/78348f03-a11c-4a6b-b8db-2acf4fee81b1 -o dvf/2023.csv
 | 
				
			||||||
    curl -L https://www.data.gouv.fr/fr/datasets/r/87038926-fb31-4959-b2ae-7a24321c599a -o dvf/2022.csv
 | 
					    curl -L https://www.data.gouv.fr/fr/datasets/r/87038926-fb31-4959-b2ae-7a24321c599a -o dvf/2022.csv
 | 
				
			||||||
    curl -L https://www.data.gouv.fr/fr/datasets/r/817204ac-2202-4b4a-98e7-4184d154d98c -o dvf/2021.csv
 | 
					    curl -L https://www.data.gouv.fr/fr/datasets/r/817204ac-2202-4b4a-98e7-4184d154d98c -o dvf/2021.csv
 | 
				
			||||||
@@ -10,10 +17,11 @@ https://www.data.gouv.fr/fr/datasets/demandes-de-valeurs-foncieres/
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
Voila un sample des données parsées [dvf/sample.json](dvf/sample.json)
 | 
					Voila un sample des données parsées [dvf/sample.json](dvf/sample.json)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# GEODVF
 | 
					## GEODVF
 | 
				
			||||||
 | 
					
 | 
				
			||||||
https://files.data.gouv.fr/geo-dvf/latest/csv/
 | 
					https://files.data.gouv.fr/geo-dvf/latest/csv/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    mkdir -p geodvf
 | 
				
			||||||
    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz
 | 
					    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2023/full.csv.gz -o geodvf/2023.csv.gz
 | 
				
			||||||
    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz
 | 
					    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz -o geodvf/2022.csv.gz
 | 
				
			||||||
    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz
 | 
					    curl https://files.data.gouv.fr/geo-dvf/latest/csv/2021/full.csv.gz -o geodvf/2021.csv.gz
 | 
				
			||||||
@@ -24,8 +32,53 @@ Voila un sample des données parsées [geodvf/sample.json](geodvf/sample.json)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
# run
 | 
					# run
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Cette commande ouvre les fichiers csv et chie du json
 | 
					Cette commande ouvre les fichiers csv et génère des inserts mysql en batch
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    node parse.js dvf/2023.csv
 | 
					    echo "MYSQL=mysql://user:password@host/database?charset=utf8mb4&connectionLimit=10" > .env
 | 
				
			||||||
    node parse.js geodvf/2023.csv.gz
 | 
					    node parse.js geodvf/2023.csv.gz > geodvf/2023.sql
 | 
				
			||||||
    ...
 | 
					    pv geodvf/2023.sql | mysql -u user -ppassword -h host database
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					CREATE TABLE IF NOT EXISTS dvf (
 | 
				
			||||||
 | 
					    id_mutation VARCHAR(255),
 | 
				
			||||||
 | 
					    date_mutation DATE,
 | 
				
			||||||
 | 
					    numero_disposition VARCHAR(255),
 | 
				
			||||||
 | 
					    nature_mutation VARCHAR(255),
 | 
				
			||||||
 | 
					    valeur_fonciere DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    adresse_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    adresse_suffixe VARCHAR(255),
 | 
				
			||||||
 | 
					    adresse_nom_voie VARCHAR(255),
 | 
				
			||||||
 | 
					    adresse_code_voie VARCHAR(255),
 | 
				
			||||||
 | 
					    code_postal VARCHAR(255),
 | 
				
			||||||
 | 
					    code_commune VARCHAR(255),
 | 
				
			||||||
 | 
					    nom_commune VARCHAR(255),
 | 
				
			||||||
 | 
					    code_departement VARCHAR(255),
 | 
				
			||||||
 | 
					    ancien_code_commune VARCHAR(255),
 | 
				
			||||||
 | 
					    ancien_nom_commune VARCHAR(255),
 | 
				
			||||||
 | 
					    id_parcelle VARCHAR(255),
 | 
				
			||||||
 | 
					    ancien_id_parcelle VARCHAR(255),
 | 
				
			||||||
 | 
					    numero_volume VARCHAR(255),
 | 
				
			||||||
 | 
					    lot1_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    lot1_surface_carrez DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    lot2_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    lot2_surface_carrez DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    lot3_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    lot3_surface_carrez DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    lot4_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    lot4_surface_carrez DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    lot5_numero VARCHAR(255),
 | 
				
			||||||
 | 
					    lot5_surface_carrez DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    nombre_lots INT,
 | 
				
			||||||
 | 
					    code_type_local VARCHAR(255),
 | 
				
			||||||
 | 
					    type_local VARCHAR(255),
 | 
				
			||||||
 | 
					    surface_reelle_bati DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    nombre_pieces_principales INT,
 | 
				
			||||||
 | 
					    code_nature_culture VARCHAR(255),
 | 
				
			||||||
 | 
					    nature_culture VARCHAR(255),
 | 
				
			||||||
 | 
					    code_nature_culture_speciale VARCHAR(255),
 | 
				
			||||||
 | 
					    nature_culture_speciale VARCHAR(255),
 | 
				
			||||||
 | 
					    surface_terrain DECIMAL(15, 2),
 | 
				
			||||||
 | 
					    longitude DECIMAL(10, 6),
 | 
				
			||||||
 | 
					    latitude DECIMAL(10, 6)
 | 
				
			||||||
 | 
					);
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										13
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										13
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							@@ -10,6 +10,7 @@
 | 
				
			|||||||
            "license": "ISC",
 | 
					            "license": "ISC",
 | 
				
			||||||
            "dependencies": {
 | 
					            "dependencies": {
 | 
				
			||||||
                "csv-parse": "^5.5.6",
 | 
					                "csv-parse": "^5.5.6",
 | 
				
			||||||
 | 
					                "dotenv": "^16.4.5",
 | 
				
			||||||
                "moment": "^2.30.1",
 | 
					                "moment": "^2.30.1",
 | 
				
			||||||
                "mysql2": "^3.10.0"
 | 
					                "mysql2": "^3.10.0"
 | 
				
			||||||
            },
 | 
					            },
 | 
				
			||||||
@@ -32,6 +33,18 @@
 | 
				
			|||||||
                "node": ">=0.10"
 | 
					                "node": ">=0.10"
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
 | 
					        "node_modules/dotenv": {
 | 
				
			||||||
 | 
					            "version": "16.4.5",
 | 
				
			||||||
 | 
					            "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
 | 
				
			||||||
 | 
					            "integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
 | 
				
			||||||
 | 
					            "license": "BSD-2-Clause",
 | 
				
			||||||
 | 
					            "engines": {
 | 
				
			||||||
 | 
					                "node": ">=12"
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            "funding": {
 | 
				
			||||||
 | 
					                "url": "https://dotenvx.com"
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
        "node_modules/generate-function": {
 | 
					        "node_modules/generate-function": {
 | 
				
			||||||
            "version": "2.3.1",
 | 
					            "version": "2.3.1",
 | 
				
			||||||
            "resolved": "https://registry.npmjs.org/generate-function/-/generate-function-2.3.1.tgz",
 | 
					            "resolved": "https://registry.npmjs.org/generate-function/-/generate-function-2.3.1.tgz",
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -20,9 +20,10 @@
 | 
				
			|||||||
        "watch": "gulp watch"
 | 
					        "watch": "gulp watch"
 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    "dependencies": {
 | 
					    "dependencies": {
 | 
				
			||||||
 | 
					        "csv-parse": "^5.5.6",
 | 
				
			||||||
 | 
					        "dotenv": "^16.4.5",
 | 
				
			||||||
        "moment": "^2.30.1",
 | 
					        "moment": "^2.30.1",
 | 
				
			||||||
        "mysql2": "^3.10.0",
 | 
					        "mysql2": "^3.10.0"
 | 
				
			||||||
        "csv-parse": "^5.5.6"
 | 
					 | 
				
			||||||
    },
 | 
					    },
 | 
				
			||||||
    "engines": {
 | 
					    "engines": {
 | 
				
			||||||
        "node": ">=21.0.0"
 | 
					        "node": ">=21.0.0"
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										56
									
								
								parse.js
									
									
									
									
									
								
							
							
						
						
									
										56
									
								
								parse.js
									
									
									
									
									
								
							@@ -1,38 +1,72 @@
 | 
				
			|||||||
import { parse } from 'csv-parse';
 | 
					import { parse } from 'csv-parse';
 | 
				
			||||||
import fs from 'node:fs';
 | 
					import fs from 'node:fs';
 | 
				
			||||||
import zlib from 'zlib';
 | 
					import zlib from 'zlib';
 | 
				
			||||||
 | 
					import mysql from 'mysql2';
 | 
				
			||||||
 | 
					import dotenv from 'dotenv';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					dotenv.config();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if (process.argv.length != 3) {
 | 
					if (process.argv.length != 3) {
 | 
				
			||||||
    throw new Error('You should give a project dir');
 | 
					    throw new Error('You should give a project dir');
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
const file = process.argv[2];
 | 
					const file = process.argv[2];
 | 
				
			||||||
const delimiter = (file.match(/\.gz$/)) ? ',' : '|';
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// csv parser
 | 
				
			||||||
 | 
					const delimiter = (file.match(/\.gz$/)) ? ',' : '|';
 | 
				
			||||||
const parser = parse({
 | 
					const parser = parse({
 | 
				
			||||||
    delimiter,
 | 
					    delimiter,
 | 
				
			||||||
    columns: true,
 | 
					    columns: true,
 | 
				
			||||||
});
 | 
					});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
let lines = 0;
 | 
					// mysql connection, used only to escape values and identifiers
 | 
				
			||||||
 | 
					const connectionString = process.env.MYSQL;
 | 
				
			||||||
 | 
					if (!connectionString) {
 | 
				
			||||||
 | 
					    throw new Error('MYSQL environment variable not set');
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					const connection = mysql.createConnection(process.env.MYSQL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// generate sql
 | 
				
			||||||
 | 
					let columns = [];
 | 
				
			||||||
 | 
					const flushBatch = (batch) => {
 | 
				
			||||||
 | 
					    if (batch.length === 0) return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    const values = batch.map(row => `(${row.map(val => connection.escape(val)).join(', ')})`).join(', ');
 | 
				
			||||||
 | 
					    const sql = `INSERT INTO dvf (${columns.join(', ')}) VALUES ${values};`;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    console.log(sql);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // connection.query(sql, (error, results) => {
 | 
				
			||||||
 | 
					    //     if (error) throw error;
 | 
				
			||||||
 | 
					    //     console.log('Inserted rows:', results.affectedRows);
 | 
				
			||||||
 | 
					    // });
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// quand quelques lignes de csv sont parsées, on les assemble puis on génère le sql
 | 
				
			||||||
parser.on('readable', function(){
 | 
					parser.on('readable', function(){
 | 
				
			||||||
    let record;
 | 
					    let record;
 | 
				
			||||||
 | 
					    const batch = []; 
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    while ((record = parser.read()) !== null) {
 | 
					    while ((record = parser.read()) !== null) {
 | 
				
			||||||
        console.log(record);
 | 
					        // console.log(record);
 | 
				
			||||||
        lines++;
 | 
					
 | 
				
			||||||
 | 
					        // get columns and values to insert and escape them for sql
 | 
				
			||||||
 | 
					        if (columns.length === 0) {
 | 
				
			||||||
 | 
					            columns = Object.keys(record).map(col => connection.escapeId(col));
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        const values = Object.values(record);
 | 
				
			||||||
 | 
					        batch.push(values);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    flushBatch(batch);
 | 
				
			||||||
});
 | 
					});
 | 
				
			||||||
parser.on('error', function(err){
 | 
					parser.on('error', function(err){
 | 
				
			||||||
    console.error(err.message);
 | 
					    console.error(err.message);
 | 
				
			||||||
});
 | 
					});
 | 
				
			||||||
parser.on('end', function(){
 | 
					parser.on('end', function(){
 | 
				
			||||||
    console.log('end', lines);
 | 
					    connection.end();
 | 
				
			||||||
});
 | 
					});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// ouvre le fichier, et le décompresse si besoin
 | 
				
			||||||
if (file.match(/\.gz$/)) fs.createReadStream(file).pipe(zlib.createGunzip()).pipe(parser);
 | 
					if (file.match(/\.gz$/)) fs.createReadStream(file).pipe(zlib.createGunzip()).pipe(parser);
 | 
				
			||||||
else fs.createReadStream(file).pipe(parser);
 | 
					else fs.createReadStream(file).pipe(parser);
 | 
				
			||||||
 | 
					 | 
				
			||||||
// affiche la progression
 | 
					 | 
				
			||||||
const interval = setInterval(() => {
 | 
					 | 
				
			||||||
    console.log(`found ${lines} lines`);
 | 
					 | 
				
			||||||
}, 1000);
 | 
					 | 
				
			||||||
interval.unref();
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user