Charger des packages¶
In [2]:
# Pour le datascience
library(tidyverse, warn.conflicts = FALSE)
-- Attaching packages --------------------------------------- tidyverse 1.3.2 -- v ggplot2 3.4.2 v purrr 1.0.2 v tibble 3.2.1 v dplyr 1.1.4 v tidyr 1.3.1 v stringr 1.5.0 v readr 2.1.5 v forcats 1.0.0 -- Conflicts ------------------------------------------ tidyverse_conflicts() -- x dplyr::filter() masks stats::filter() x dplyr::lag() masks stats::lag()
In [4]:
library(readxl)
In [5]:
# Aussi foreign
library(haven, warn.conflicts = FALSE)
In [8]:
library(labelled, warn.conflicts = FALSE)
Importer des bases de données¶
Utiliser des données du système¶
In [52]:
# try() pour prevenir les bugs
# data()
# data(package = .packages(all.available = TRUE))
data(package = c("dplyr", "stringr"))
Data sets
| Package | Item | Title |
|---|---|---|
| <chr> | <chr> | <chr> |
| dplyr | band_instruments | Band membership |
| dplyr | band_instruments2 | Band membership |
| dplyr | band_members | Band membership |
| dplyr | starwars | Starwars characters |
| dplyr | storms | Storm tracks data |
| stringr | fruit | Sample character vectors for practicing string manipulations |
| stringr | sentences | Sample character vectors for practicing string manipulations |
| stringr | words | Sample character vectors for practicing string manipulations |
In [13]:
data("fruit", package = "stringr")
In [13]:
data("storms", package = "dplyr")
In [55]:
str(storms)
tibble [19,537 x 13] (S3: tbl_df/tbl/data.frame) $ name : chr [1:19537] "Amy" "Amy" "Amy" "Amy" ... $ year : num [1:19537] 1975 1975 1975 1975 1975 ... $ month : num [1:19537] 6 6 6 6 6 6 6 6 6 6 ... $ day : int [1:19537] 27 27 27 27 28 28 28 28 29 29 ... $ hour : num [1:19537] 0 6 12 18 0 6 12 18 0 6 ... $ lat : num [1:19537] 27.5 28.5 29.5 30.5 31.5 32.4 33.3 34 34.4 34 ... $ long : num [1:19537] -79 -79 -79 -79 -78.8 -78.7 -78 -77 -75.8 -74.8 ... $ status : Factor w/ 9 levels "disturbance",..: 7 7 7 7 7 7 7 7 8 8 ... $ category : num [1:19537] NA NA NA NA NA NA NA NA NA NA ... $ wind : int [1:19537] 25 25 25 25 25 25 25 30 35 40 ... $ pressure : int [1:19537] 1013 1013 1013 1013 1012 1012 1011 1006 1004 1002 ... $ tropicalstorm_force_diameter: int [1:19537] NA NA NA NA NA NA NA NA NA NA ... $ hurricane_force_diameter : int [1:19537] NA NA NA NA NA NA NA NA NA NA ...
In [14]:
storms <- as.data.frame(storms)
fruit <- as.data.frame(fruit)
In [57]:
str(storms$name)
chr [1:19537] "Amy" "Amy" "Amy" "Amy" "Amy" "Amy" "Amy" "Amy" "Amy" "Amy" ...
In [ ]:
# attach() puts the columns of `storms` on the search path so that they can be
# referenced by bare name (see str(year) in the next cell). It must be paired
# with detach(storms), as done below. Outside of a demonstration,
# with(storms, ...) gives the same access without altering the search path.
attach(storms)
In [59]:
str(year)
num [1:19537] 1975 1975 1975 1975 1975 ...
In [60]:
detach(storms)
In [61]:
# load(file = "myenv.RData")
save(storms, file = "myenv.RData")
In [62]:
# load(file = "mydb.rda")
save(storms, file = "mydb.rda")
In [63]:
# load(file = ".RData")
save.image()
Lire les formats SPSS, STATA, SAS¶
In [13]:
# Format SPSS
write_sav(storms, bd <- tempfile(fileext = ".sav"))
In [14]:
# Renommer et enregistrer dans le répertoire courant
file.rename(bd, "storms.sav")
TRUE
In [15]:
# Importation de la base SPSS
dat <- read_sav("storms.sav", col_select = c(1:5), n_max = 10)
In [16]:
# long (type décimal) est un nom réservé dans stata
write_dta(rename(storms, gpslong = long), bd <- tempfile(fileext = ".dta"))
In [17]:
file.rename(bd, "storms.dta")
TRUE
In [18]:
dat <- read_dta("storms.dta", col_select = c(1:5), n_max = 10)
In [13]:
write_xpt(storms, bd <- tempfile(fileext = ".xpt"))
In [14]:
file.rename(bd, "storms.xpt")
TRUE
In [21]:
dat <- read_xpt("storms.xpt", col_select = c(1:5), n_max = 10)
Lire un fichier texte (csv)¶
In [15]:
# write.table()
write_delim(storms, "storms.csv", delim = ",")
In [15]:
# Read a comma-separated file with readr::read_delim(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read_delim(file.choose(), delim = ",", show_col_types = FALSE)
In [75]:
# Read a comma-separated file with base read.delim(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read.delim(file.choose(), sep = ",")
In [76]:
# Read only three columns of a CSV file with data.table::fread(); the file is
# picked interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- data.table::fread(
  file.choose(),
  select = c("name", "day", "category")
)
In [77]:
# Read a comma-separated file with readr::read_csv(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read_csv(file.choose(), show_col_types = FALSE)
In [78]:
# Read a comma-separated file with base read.csv(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read.csv(file.choose())
In [79]:
# Save a semicolon-separated CSV. write_csv2() uses ";" as the delimiter AND
# "," as the decimal mark, which is the convention read_csv2() / read.csv2()
# expect. Writing with write_delim(delim = ";") keeps "." decimals, which
# those readers then misinterpret (e.g. as a grouping mark or as text).
write_csv2(storms, "storms2.csv")
In [ ]:
# Read a semicolon-separated file with readr::read_csv2(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read_csv2(file.choose(), show_col_types = FALSE)
In [81]:
# Read a semicolon-separated file with base read.csv2(); the file is picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read.csv2(file.choose())
In [89]:
# Import the cell range C2:G10 of the sheet named "STORMS" from an Excel
# workbook picked interactively (read_xls() and read_xlsx() are the
# format-specific variants of read_excel()). file.choose() is the portable
# base-R dialog, whereas choose.files() only exists on Windows builds of R.
dat <- read_excel(file.choose(), range = "STORMS!C2:G10")
In [ ]:
# Import rows 2 to 10 of the first sheet of an Excel workbook picked
# interactively. file.choose() is the portable base-R dialog, whereas
# choose.files() only exists on Windows builds of R.
dat <- read_excel(file.choose(), sheet = 1, range = cell_rows(2:10))
In [91]:
# Package readxl
readxl_example()
- 'clippy.xls'
- 'clippy.xlsx'
- 'datasets.xls'
- 'datasets.xlsx'
- 'deaths.xls'
- 'deaths.xlsx'
- 'geometry.xls'
- 'geometry.xlsx'
- 'type-me.xls'
- 'type-me.xlsx'
In [92]:
# Chemin complet d'un fichier d'exemple fourni par readxl
readxl_example("datasets.xlsx")
'C:/Users/ibtall/anaconda3/Lib/R/library/readxl/extdata/datasets.xlsx'
In [93]:
# Le même chemin obtenu directement avec system.file()
system.file("extdata", "datasets.xlsx", package = "readxl")
'C:/Users/ibtall/anaconda3/Lib/R/library/readxl/extdata/datasets.xlsx'
In [94]:
# Les feuilles du classeur excel
excel_sheets(readxl_example("datasets.xlsx"))
- 'iris'
- 'mtcars'
- 'chickwts'
- 'quakes'
In [95]:
# Import the first sheet of the example workbook with readxl, which is already
# attached. read.xlsx(..., sheetIndex = 1) belongs to the xlsx package, which
# this notebook never loads.
dat <- read_excel(readxl_example("datasets.xlsx"), sheet = 1)
In [96]:
# Importer la feuille mtcars
dat <- read_excel(readxl_example("datasets.xlsx"), sheet = "mtcars")
Manipuler des données¶
Décrire la base¶
In [97]:
# Noms des variables
names(storms)
- 'name'
- 'year'
- 'month'
- 'day'
- 'hour'
- 'lat'
- 'long'
- 'status'
- 'category'
- 'wind'
- 'pressure'
- 'tropicalstorm_force_diameter'
- 'hurricane_force_diameter'
In [98]:
# Dimension de la base: lignes x colonnes
dim(storms)
- 19537
- 13
In [99]:
# Les noms des variables
dimnames(storms)[2]
-
- 'name'
- 'year'
- 'month'
- 'day'
- 'hour'
- 'lat'
- 'long'
- 'status'
- 'category'
- 'wind'
- 'pressure'
- 'tropicalstorm_force_diameter'
- 'hurricane_force_diameter'
Extraire des données¶
In [100]:
# Trois premières observations
dat <- head(storms, 3)
In [101]:
# Sept dernières observations
dat <- tail(storms, 7)
In [102]:
# Quelques observations
dat <- storms[c(2,4:6), ]
In [103]:
# Exclusion d'observations
dat <- storms[-c(2,7:19535), ]
In [104]:
# Observations vérifiant des critères
dat <- storms[which(storms$name == 'Amy' & between(storms$day, 25, 30)), ]
In [105]:
# slice_head, slice_tail, slice_min, slice_max, slice_sample
dat <- slice(storms, 19530:n())
In [106]:
# Shuffle all rows reproducibly, then keep the first row encountered for each
# status. The number of rows is taken from the data rather than hard-coded,
# so the cell keeps working if `storms` changes size.
set.seed(1234)
shuffled_rows <- sample(seq_len(nrow(storms)))
dat <- distinct(storms[shuffled_rows, ], status, .keep_all = TRUE)
In [19]:
# Drop variables, first with `$<-` on one column, then with `[<-` on several.
# Columns are addressed by name so the removal does not depend on their
# positions. `month` stays in place: later cells select column 9 and expect it
# to be `wind`, which only holds while the first eight columns are unchanged.
dat$category <- NULL
dat[c("tropicalstorm_force_diameter", "hurricane_force_diameter")] <- NULL
In [109]:
# Keep the rows whose name is neither "Katrina" nor "Allison", together with
# the rows recorded on or before the 10th that have a known wind value.
# `&` binds tighter than `|`; the parentheses only make that grouping explicit.
dt <- dat %>%
  filter(
    !name %in% c("Katrina", "Allison") | (day <= 10 & !is.na(wind))
  )
In [110]:
# Séléction de variables
dt <- dat[c("name","year","wind")]
In [111]:
# Séléction de variables
dt <- dat[c(1,2,9)]
In [112]:
# Select variables with a logical mask. The mask has to be built from the
# names of `dat` itself: `dat` no longer has the same columns as `storms`, so
# a mask computed on names(storms) points at the wrong positions.
dt <- dat[names(dat) %in% c("name", "year", "wind")]
In [113]:
# Sélection de variables
dt <- select(dat, name, day, wind)
In [114]:
# starts_with, ends_with, contains, matches("[pt]al")
# is.numeric, is_whole, is.character
dt <- select(dat, where(is.factor) | starts_with("l"))
In [115]:
# Filtre sur les observations et sélection de variables
dt <- subset(dat, between(day,15, 30), select = c(name, year, day, status))
Trier, renommer, créer, supprimer¶
In [116]:
# Tri sur les observations
dt <- dat[with(dat, order(-year, name)), ]
In [117]:
# Trier les observations
dt <- arrange(dat, desc(year), name)
In [118]:
# Rename variables with base R, working on a copy. A chained assignment
# (`dt <- names(dat)[...] <- ...`) would rename the columns of `dat` itself and
# leave `dt` holding only the character vector of new names.
dt <- dat
names(dt)[match(c("lat", "long"), names(dt))] <- c("gpslat", "gpslong")
In [119]:
# Renommer les variables
dt <- rename(dat, pnom = name, jour = day)
In [120]:
# Renommer des variables commençant par l
dt <- rename_with(dat, toupper, starts_with("l"))
In [16]:
# Création de variable binaire
dat$duree <- ifelse(dat$year <= 2010, "ancien", "récent")
In [17]:
# Création de variable bimodale
dat$h_pres <- with(dat, ifelse(pressure < mean(pressure), 1, 2))
In [18]:
# Simulation de variable catégorielle
dat$sexe <- gl(n = 2, k = 3, length = length(dat$hour), labels = c("Homme", "Femme"))
In [19]:
# Simulation de variable entière
set.seed(123)
dat$size <- sample(x = c(1,2,3), size = length(dat$hour), replace = TRUE)
In [20]:
# Create a text variable: four distinct lowercase letters drawn at random and
# glued into a single string, which is recycled over every row. Sampling from
# `letters` directly stays within its 26 elements; an index of 27 would yield
# NA and paste the text "NA" into the result.
dat$mot <- paste(sample(letters, 4), collapse = "")
In [ ]:
# Create several variables at once with mutate(). The result is not assigned,
# so it is only displayed and `dat` itself is left unchanged.
mutate(dat,
# 0 when wind is below its mean, 1 otherwise; the 4th positional argument of
# if_else() is `missing`, so 99 is used where the comparison is NA.
h_wind = if_else(wind < mean(wind), 0, 1, 99),
# Wind class: 1 up to 35, 2 from 36 to 55, 3 for everything else (TRUE is the
# catch-all branch).
windcl = case_when(wind <= 35 ~ 1, between(wind, 36, 55) ~ 2, TRUE ~ 3),
# Relabel the "hurricane" level as "hcane"; `.default` supplies the existing
# level names for the levels that are not recoded.
stat_b = recode(status, hurricane = "hcane", .default = levels(status)),
# Natural logarithm of wind.
l_wind = log(wind),
# One random draw per row (uniform on [1, 10], then normal with mean 3 and
# sd 1). No seed is set in this cell.
v_unif = runif(length(hour), min = 1, max = 10),
v_norm = rnorm(length(hour), mean = 3, sd = 1),
# Return only the columns created above.
.keep = "none"
)
In [38]:
# Attach variable labels with the labelled package: on single columns through
# `var_label<-`, then on several columns at once through a named list.
# (set_label() comes from sjlabelled, which this notebook does not attach.)
var_label(dat$name) <- "Le prénom de l'agent"
var_label(dat$hour) <- "L'heure de l'interview"
var_label(dat) <- list(day = "Jour de l'interview", wind = "Pression du Vent")
In [129]:
# Display the variable labels of the selected columns as a named list, using
# labelled::var_label() (get_label() comes from sjlabelled, which this
# notebook does not attach).
var_label(dat[, c("name", "day", "wind")])
- name
- 'Le prénom de l\'agent'
- day
- 'Jour de l\'interview'
- wind
- 'Pression du Vent'
In [130]:
# Afficher les libellés
attr(dat$name, "label")
'Le prénom de l\'agent'
In [ ]:
# Supprimer les libellés
var_label(dat$day) <- NULL
var_label(dat) <- NULL
Manager les facteurs¶
In [21]:
# Modifier en variables catégorielles
dat$status <- as_factor(dat$status)
dat$sexe <- as_factor(dat$sexe)
dat$duree <- as_factor(dat$duree)
dat$h_pres <- factor(dat$h_pres, levels = c(1,2), labels = c("Petit", "Grand"))
dat$size <- factor(dat$size, levels = c(1,2,3), labels = c("Petit", "Moyen", "Grand"))
In [ ]:
# Define value labels with the labelled package. Value labels cannot be
# attached to factors, so both variables are first brought back to their
# numeric codes (1/2 for h_pres, 1/2/3 for size); this is a no-op when they
# are already numeric. set_labels() comes from sjlabelled, which this notebook
# does not attach, so `val_labels<-` is used throughout.
dat$h_pres <- as.numeric(dat$h_pres)
dat$size <- as.numeric(dat$size)
# Labels on one variable (the second call replaces the first set).
val_labels(dat$h_pres) <- c("Léger" = 1, "Lourd" = 2)
val_labels(dat$h_pres) <- c(Petit = 1, Grand = 2)
# Labels on several variables at once through a named list.
val_labels(dat) <- list(
  size = c(Petit = 1, Moyen = 2, Grand = 3),
  h_pres = c(TPetit = 1, TGrand = 2)
)
In [ ]:
# Modifier les modalités
val_label(dat$size, 3) <- "Très Grand"
val_label(dat[, c("h_pres", "size")], 1) <- "Gros"
val_labels(dat[, c("h_pres", "size")]) <- c(small = 1, Big = 2)
In [ ]:
# Afficher le libellé d'une valeur
val_label(dat$size, 2)
In [ ]:
# Afficher les libellés de valeurs
val_labels(dat$size)
In [29]:
attr(dat$size, "labels")
- Petit
- 1
- Moyen
- 2
- Grand
- 3
In [ ]:
# Afficher les libellés de valeurs de plusieurs variables
val_labels(dat[, c("h_pres", "size")])
In [39]:
dat$h_pres <- labelled(dat$h_pres, labels = c(Petit = 1, Grand = 2),
label = "Poids fictive")
In [27]:
dat$size <- labelled(dat$size, labels = c("Petit" = 1, "Moyen" = 2, "Grand" = 3),
label = "Taille fictive")
In [44]:
dat$status <- as_factor(dat$status)
In [67]:
# Affichage des modalités
levels(dat$status)
- 'disturbance'
- 'extratropical'
- 'hurricane'
- 'other low'
- 'subtropical depression'
- 'subtropical storm'
- 'tropical depression'
- 'tropical storm'
- 'tropical wave'
In [68]:
# Regrouper les modalités
levels(dat$status) <- c('disturbance','disturbance','hurricane','hurricane',
'disturbance','tropical','tropical','tropical','tropical')
levels(dat$status)
- 'disturbance'
- 'hurricane'
- 'tropical'
In [ ]:
# Supprimer les modalités
val_label(dat$size, 2) <- NULL
val_labels(dat$size) <- NULL
val_labels(dat[, c("h_pres", "sexe")]) <- NULL
val_labels(dat) <- NULL
Manipuler les textes¶
In [ ]:
# str_sort(string)
sort(dat$status)
In [ ]:
# str_order(string)
order(dat$status)
In [ ]:
# str_to_lower(string)
tolower(dat$name)
In [ ]:
# str_to_upper(string)
toupper(dat$name)
In [ ]:
# str_length(string)
nchar(dat$name)
In [ ]:
# str_sub(string, start, end)
substr(dat$name, start = 1, stop = 3)
In [ ]:
# str_split(string, pattern)
strsplit(dat$name, split = "r")
In [ ]:
# str_dup(string, times)
strrep(dat$name, times = 2)
In [ ]:
# Numéro de la Position: str_which(string, pattern)
grep("[rR]", dat$name)
In [ ]:
# Valeur de la Position: str_subset(string, pattern)
grep("[rR]", dat$name, value = TRUE)
In [ ]:
# Logique de la position: str_detect(string, pattern)
grepl("[rR]", dat$name)
In [ ]:
# Position dans le text
regexpr("[rR]", dat$name)
In [ ]:
# str_locate_all(string, pattern)
gregexpr("[rR]", dat$name)
In [ ]:
# Donne l'indexe et la position de début [index. position] ou -1
regexec("[rR]", dat$name)
In [ ]:
# Donne l'indexe et la position de début [index. position] ou -1
gregexec("[rR]", dat$name)
In [ ]:
# Montrer l'index et le caractère: str_extract(string, pattern)
regmatches(dat$name, regexpr("[rR]", dat$name))
In [ ]:
# Montrer l'index et le caractère: str_extract_all(string, pattern)
regmatches(dat$name, gregexpr("[rR]", dat$name))
In [ ]:
# Montrer l'index et le caractère: str_match(string, pattern)
regmatches(dat$name, regexec("[rR]", dat$name))
In [ ]:
# str_replace(string, pattern, replacement)
sub("[rR]", "A", dat$name)
In [ ]:
# str_replace_all(string, pattern, replacement)
gsub("[rR]", "A", dat$name)
In [ ]:
# str_trim(string, side = "left")
trimws(dat$name, which = "left", whitespace = "[ \t\r\n]")
In [ ]:
# str_wrap(string)
strwrap(dat$name)