insee_number_translator/pre_process/preprocess.go

155 lines
3.7 KiB
Go

package main
import (
"encoding/csv"
"encoding/json"
"fmt"
"io/ioutil"
"os"
)
// main prints a notice for users and then runs the pre-processing with the
// default raw-data and curated-data folders.
func main() {
	const (
		rawDataFolder     = "data/raw_data"
		curatedDataFolder = "data/curated_data"
	)

	fmt.Println("This is intended for tinkerers only, not for end users.")
	PreProcessRawData(rawDataFolder, curatedDataFolder)
}
// PreProcessRawData converts the raw CSV files found in sourceFolder into JSON
// files written to targetFolder, creating targetFolder first if needed.
//
// Three conversions are attempted, in this order:
//
//	commune.csv     -> cities.json
//	departement.csv -> departments.json
//	pays.csv        -> countries.json
//
// Failures are reported on standard error. If targetFolder cannot be created,
// nothing else is attempted. Otherwise a failed conversion does not stop the
// remaining ones, and the completion message is printed regardless.
func PreProcessRawData(sourceFolder, targetFolder string) {
	if err := os.MkdirAll(targetFolder, 0o755); err != nil {
		// Goes to stderr with a trailing newline, like every other failure
		// reported by this function.
		fmt.Fprintf(os.Stderr, "Error, couldn't create target folder %s: %s\n", targetFolder, err)
		return
	}

	steps := []struct {
		label  string
		source string
		target string
		run    func(sourceFileName, targetFileName string) error
	}{
		{"cities", "commune.csv", "cities.json", preProcessCities},
		{"departments", "departement.csv", "departments.json", preProcessDepartments},
		{"countries", "pays.csv", "countries.json", preProcessCountries},
	}
	for _, step := range steps {
		err := step.run(sourceFolder+"/"+step.source, targetFolder+"/"+step.target)
		if err != nil {
			// Best effort: report and carry on with the next file.
			fmt.Fprintf(os.Stderr, "Error during %s pre processing\n", step.label)
			fmt.Fprintln(os.Stderr, err)
		}
	}

	fmt.Println("Finished pre-processing data.")
}
// preProcessCities converts the cities CSV at sourceFileName into a JSON file
// at targetFileName by delegating to preProcessSimpleFile, using the "COM"
// column as the code column.
func preProcessCities(sourceFileName, targetFileName string) error {
	const cityCodeColumn = "COM"
	return preProcessSimpleFile(sourceFileName, targetFileName, cityCodeColumn)
}
// preProcessDepartments converts the departments CSV at sourceFileName into a
// JSON file at targetFileName by delegating to preProcessSimpleFile, using the
// "DEP" column as the code column.
func preProcessDepartments(sourceFileName, targetFileName string) error {
	const departmentCodeColumn = "DEP"
	return preProcessSimpleFile(sourceFileName, targetFileName, departmentCodeColumn)
}
// preProcessSimpleFile converts a CSV file into a JSON object that maps each
// row's code to its name.
//
// The first CSV record is treated as the header. The code is read from the
// column named codeColumn and the name from the column named "NCCENR"; other
// columns are ignored. When several rows share a code, the last row wins. The
// result is written to targetFileName as a single JSON object.
//
// An error is returned if the source cannot be opened or parsed, if a required
// column is absent from the header, or if the target cannot be written. The
// target is only written once the whole source has been read successfully.
func preProcessSimpleFile(sourceFileName, targetFileName, codeColumn string) error {
	const nameColumn = "NCCENR"

	sourceFile, err := os.Open(sourceFileName)
	if err != nil {
		// The *os.PathError already names the file and the operation.
		return err
	}
	defer sourceFile.Close()

	reader := csv.NewReader(sourceFile)
	header, err := reader.Read()
	if err != nil {
		// An empty file surfaces here as a bare io.EOF, so say which file.
		return fmt.Errorf("reading header of %s: %w", sourceFileName, err)
	}

	columnsToIndex := map[string]int{codeColumn: -1, nameColumn: -1}
	for index, name := range header {
		if _, wanted := columnsToIndex[name]; wanted {
			columnsToIndex[name] = index
		}
	}
	// Check in a fixed order: ranging over the map would report an arbitrary
	// column when more than one is missing.
	for _, column := range []string{codeColumn, nameColumn} {
		if columnsToIndex[column] == -1 {
			return fmt.Errorf("column %s is missing in file %s. Found columns: %s", column, sourceFileName, header)
		}
	}
	codeIndex := columnsToIndex[codeColumn]
	nameIndex := columnsToIndex[nameColumn]

	// With default settings csv.Reader rejects any record whose field count
	// differs from the header's, so the indexing below cannot go out of range.
	records, err := reader.ReadAll()
	if err != nil {
		return fmt.Errorf("reading records of %s: %w", sourceFileName, err)
	}

	items := make(map[string]string, len(records))
	for _, record := range records {
		items[record[codeIndex]] = record[nameIndex]
	}

	itemsJSON, err := json.Marshal(items)
	if err != nil {
		return fmt.Errorf("encoding items for %s: %w", targetFileName, err)
	}
	return ioutil.WriteFile(targetFileName, itemsJSON, 0o644)
}
// preProcessCountries converts the countries CSV at sourceFileName into a JSON
// object that maps each code to the list of names found for it.
//
// The first CSV record is treated as the header. The code is read from the
// "COG" column and the name from the "LIBCOG" column; other columns are
// ignored. Names sharing a code are collected in file order. The result is
// written to targetFileName as a single JSON object of string arrays.
//
// An error is returned if the source cannot be opened or parsed, if a required
// column is absent from the header, or if the target cannot be written. The
// target is only written once the whole source has been read successfully.
func preProcessCountries(sourceFileName, targetFileName string) error {
	const (
		codeColumn = "COG"
		nameColumn = "LIBCOG"
	)

	sourceFile, err := os.Open(sourceFileName)
	if err != nil {
		// The *os.PathError already names the file and the operation.
		return err
	}
	defer sourceFile.Close()

	reader := csv.NewReader(sourceFile)
	header, err := reader.Read()
	if err != nil {
		// An empty file surfaces here as a bare io.EOF, so say which file.
		return fmt.Errorf("reading header of %s: %w", sourceFileName, err)
	}

	columnsToIndex := map[string]int{codeColumn: -1, nameColumn: -1}
	for index, name := range header {
		if _, wanted := columnsToIndex[name]; wanted {
			columnsToIndex[name] = index
		}
	}
	// Check in a fixed order: ranging over the map would report an arbitrary
	// column when more than one is missing.
	for _, column := range []string{codeColumn, nameColumn} {
		if columnsToIndex[column] == -1 {
			return fmt.Errorf("column %s is missing in file %s. Found columns: %s", column, sourceFileName, header)
		}
	}
	codeIndex := columnsToIndex[codeColumn]
	nameIndex := columnsToIndex[nameColumn]

	// With default settings csv.Reader rejects any record whose field count
	// differs from the header's, so the indexing below cannot go out of range.
	records, err := reader.ReadAll()
	if err != nil {
		return fmt.Errorf("reading records of %s: %w", sourceFileName, err)
	}

	countries := make(map[string][]string)
	for _, record := range records {
		code := record[codeIndex]
		// append on the missing (nil) entry allocates the slice on first use.
		countries[code] = append(countries[code], record[nameIndex])
	}

	countriesJSON, err := json.Marshal(countries)
	if err != nil {
		return fmt.Errorf("encoding countries for %s: %w", targetFileName, err)
	}
	return ioutil.WriteFile(targetFileName, countriesJSON, 0o644)
}