# Read every ";"-separated field of the remote file as a character string.
vecteur <- scan(
  file = "https://r-stat-sc-donnees.github.io/donnees.csv",
  what = character(),
  sep = ";"
)
Read 20 items
# Display the raw character vector: the header fields followed by the data fields.
vecteur
[1] "individu" "taille" "poids" "pointure" "sexe" "roger" "184" "80" "44" "M" "théodule" "175,5"
[13] "78" "43" "M" "nicolas" "158" "72" "42" "M"
# Variable names: header fields 2 to 5 (field 1 is the label of the individuals).
nomcol <- vecteur[seq(from = 2, to = 5)]
nomcol
[1] "taille" "poids" "pointure" "sexe"
# Turn decimal commas into decimal points (literal replacement, no regex needed).
vecteur <- gsub(",", ".", vecteur, fixed = TRUE)
# Lay the fields out row by row: 1 header row + 3 individuals, 5 fields each.
matbrut <- matrix(
  data = vecteur,
  nrow = 4,
  ncol = 5,
  byrow = TRUE
)
matbrut
[,1] [,2] [,3] [,4] [,5]
[1,] "individu" "taille" "poids" "pointure" "sexe"
[2,] "roger" "184" "80" "44" "M"
[3,] "théodule" "175.5" "78" "43" "M"
[4,] "nicolas" "158" "72" "42" "M"
# Row labels: the first column, without the header row.
nomlign <- matbrut[-1, 1, drop = TRUE]
nomlign
[1] "roger" "théodule" "nicolas"
# Keep only the data cells: drop the header row and the column of individual names.
donnees <- matbrut[-1, -1]
# First attempt: convert the character matrix to a data frame without any labels.
essai <- data.frame(donnees)
summary(object = essai)
X1 X2 X3 X4
158 :1 72:1 42:1 M:3
175.5:1 78:1 43:1
184 :1 80:1 44:1
# Convert the character matrix to a data frame, then attach the labels
# extracted earlier (variable names and individual names).
donnees <- data.frame(donnees)
names(donnees) <- nomcol
row.names(donnees) <- nomlign
summary(object = donnees)
taille poids pointure sexe
158 :1 72:1 42:1 M:3
175.5:1 78:1 43:1
184 :1 80:1 44:1
Les trois premières variables sont des facteurs, il faut donc les convertir en numérique:
# Convert the first three columns to numeric. Going through as.character()
# yields the printed values rather than the internal codes when a column is
# a factor, and is harmless when it is already character.
donnees[1:3] <- lapply(
  donnees[1:3],
  function(colonne) as.numeric(as.character(colonne))
)
summary(object = donnees)
taille poids pointure sexe
Min. :158.0 Min. :72.00 Min. :42.0 M:3
1st Qu.:166.8 1st Qu.:75.00 1st Qu.:42.5
Median :175.5 Median :78.00 Median :43.0
Mean :172.5 Mean :76.67 Mean :43.0
3rd Qu.:179.8 3rd Qu.:79.00 3rd Qu.:43.5
Max. :184.0 Max. :80.00 Max. :44.0
# Semicolon-separated file with a header row and decimal commas.
test1 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test1.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
summary(object = test1)
CLONE B IN HT19 C19 HT29
1-105 :9 Min. :1 Min. :1.000 Min. : 3.000 Min. : 5.00 Min. : 4.50
1-41 :9 1st Qu.:1 1st Qu.:2.750 1st Qu.: 7.968 1st Qu.:18.75 1st Qu.:11.38
18-428:9 Median :1 Median :4.500 Median : 9.055 Median :21.50 Median :12.75
18-429:5 Mean :1 Mean :4.688 Mean : 8.556 Mean :21.75 Mean :11.91
3rd Qu.:1 3rd Qu.:7.000 3rd Qu.: 9.925 3rd Qu.:26.00 3rd Qu.:13.75
Max. :1 Max. :9.000 Max. :11.300 Max. :34.00 Max. :15.25
# Same kind of data in .prn form: read.table()'s default separator
# (any whitespace) and default decimal point are used; only the header is declared.
test1prn <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test1.prn",
  header = TRUE
)
summary(object = test1prn)
CLONE B IN HT19 C19 HT29
1-105 :9 Min. :1 Min. :1.000 Min. : 3.000 Min. : 5.00 Min. : 4.50
1-41 :9 1st Qu.:1 1st Qu.:3.000 1st Qu.: 8.150 1st Qu.:19.00 1st Qu.:11.50
18-428:9 Median :1 Median :5.000 Median : 9.110 Median :22.00 Median :12.75
18-429:6 Mean :1 Mean :4.727 Mean : 8.595 Mean :21.85 Mean :11.97
3rd Qu.:1 3rd Qu.:7.000 3rd Qu.: 9.900 3rd Qu.:26.00 3rd Qu.:13.75
Max. :1 Max. :9.000 Max. :11.300 Max. :34.00 Max. :15.25
# Semicolon-separated file with a header row; empty fields are read as NA.
test2 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test2.csv",
  header = TRUE,
  sep = ";",
  na.strings = ""
)
summary(object = test2)
CLONE B IN HT19 C19 HT29
1-105 : 18 Min. :1.000 Min. :1 Min. : 0.890 Min. : 3.00 Min. : 1.96
1-41 : 18 1st Qu.:1.000 1st Qu.:3 1st Qu.: 7.350 1st Qu.:17.00 1st Qu.:10.75
18-428 : 18 Median :1.000 Median :5 Median : 9.320 Median :23.00 Median :12.88
18-429 : 18 Mean :1.234 Mean :5 Mean : 8.846 Mean :22.08 Mean :12.20
18-430 : 18 3rd Qu.:1.000 3rd Qu.:7 3rd Qu.:10.500 3rd Qu.:28.00 3rd Qu.:14.00
(Other):908 Max. :2.000 Max. :9 Max. :14.200 Max. :37.00 Max. :17.50
NA's : 1 NA's :15 NA's :20 NA's :15
# Semicolon-separated file with a header row; a lone "." marks a missing value.
# NOTE(review): the summary printed below reports a maximum of 1142 for HT19,
# far above the other quantiles -- possibly a data-entry anomaly; verify.
test3 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test3.csv",
  header = TRUE,
  sep = ";",
  na.strings = "."
)
summary(object = test3)
CLONE B IN HT19 C19 HT29
1-105 : 18 Min. :1.000 Min. :1 Min. : 0.89 Min. : 3.00 Min. : 1.96
1-41 : 18 1st Qu.:1.000 1st Qu.:3 1st Qu.: 7.35 1st Qu.:17.00 1st Qu.:10.75
18-428 : 18 Median :1.000 Median :5 Median : 9.32 Median :23.00 Median :12.88
18-429 : 18 Mean :1.234 Mean :5 Mean : 10.47 Mean :22.08 Mean :12.20
18-430 : 18 3rd Qu.:1.000 3rd Qu.:7 3rd Qu.: 10.50 3rd Qu.:28.00 3rd Qu.:14.00
18-438 : 18 Max. :2.000 Max. :9 Max. :1142.00 Max. :37.00 Max. :17.50
(Other):891 NA's :15 NA's :20 NA's :15
# "|"-separated file: skip the first two lines, then read the header row;
# the first column supplies the row names.
ski <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test4.csv",
  header = TRUE,
  sep = "|",
  skip = 2,
  row.names = 1
)
summary(object = ski)
age gender first.time.skiing
Min. :24.00 Min. :0.00 1980-05-01:1
1st Qu.:28.00 1st Qu.:0.00 1982-01-31:1
Median :32.00 Median :0.00 1992-01-15:1
Mean :30.38 Mean :0.25 2003-03-16:1
3rd Qu.:33.00 3rd Qu.:0.25 2005-02-26:1
Max. :33.00 Max. :1.00 2006-03-04:1
(Other) :2
# Same file, but with an explicit class for each of the four columns read:
# the row-name column as character, then numeric, factor and Date.
ski2 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/test4.csv",
  header = TRUE,
  sep = "|",
  skip = 2,
  row.names = 1,
  colClasses = c("character", "numeric", "factor", "Date")
)
summary(object = ski2)
age gender first.time.skiing
Min. :24.00 0:6 Min. :1980-05-01
1st Qu.:28.00 1:2 1st Qu.:1989-07-20
Median :32.00 Median :2004-03-06
Mean :30.38 Mean :1998-06-01
3rd Qu.:33.00 3rd Qu.:2006-12-02
Max. :33.00 Max. :2009-03-06
# Three related tables, each with a header row: etat1 is ";"-separated,
# etat2 is ","-separated, etat3 uses the default separator and takes its
# row names from its first column.
etat1 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/etat1.csv",
  header = TRUE,
  sep = ";"
)
etat2 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/etat2.csv",
  header = TRUE,
  sep = ","
)
etat3 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/etat3.csv",
  header = TRUE,
  row.names = 1
)
# Join etat1 and etat3 on the shared key "region", then join that result
# to etat2 on the shared key "etat".
etat13 <- merge(x = etat1, y = etat3, by = "region")
etat123 <- merge(x = etat2, y = etat13, by = "etat")
head(etat13)
region etat vote
1 North Central Indiana Elephant
2 North Central Iowa Elephant
3 North Central Michigan Elephant
4 North Central Minnesota Elephant
5 North Central Illinois Elephant
6 North Central Nebraska Elephant
# Show the first rows of the table obtained by merging etat2 with etat13 on "etat".
head(etat123)
etat Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area region vote
1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 South Elephant
2 Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 West Donkey
3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 West Donkey
4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 South Elephant
5 California 21198 5114 1.1 71.71 10.3 62.6 20 156361 West Donkey
6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 West Donkey
# Semicolon-separated file with a header row and decimal commas.
fusion1 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/fusion1.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
summary(object = fusion1)
yhat1 yhat3 yhat2 yhat4 yhat5
Min. :-0.46520 Min. :-0.6267 Min. :-2.1563 Min. :0.5374 Min. :0.07706
1st Qu.:-0.37217 1st Qu.: 0.2906 1st Qu.:-1.1357 1st Qu.:0.9553 1st Qu.:0.17804
Median :-0.26458 Median : 1.1447 Median :-0.8284 Median :1.4079 Median :0.25966
Mean :-0.26641 Mean : 0.9956 Mean :-0.9200 Mean :1.3332 Mean :0.27391
3rd Qu.:-0.17349 3rd Qu.: 1.5200 3rd Qu.:-0.6037 3rd Qu.:1.6578 3rd Qu.:0.37683
Max. :-0.03419 Max. : 2.8496 Max. :-0.2903 Max. :2.1196 Max. :0.48926
# Semicolon-separated file with a header row and decimal commas.
fusion2 <- read.table(
  file = "https://r-stat-sc-donnees.github.io/fusion2.csv",
  header = TRUE,
  sep = ";",
  dec = ","
)
summary(object = fusion2)
Rhamnos Fucos Arabinos Xylos Mannos
Min. :-0.53483 Min. :-2.3026 Min. :-0.77403 Min. :0.4243 Min. :0.02623
1st Qu.:-0.35127 1st Qu.:-2.0090 1st Qu.:-0.24294 1st Qu.:0.9477 1st Qu.:0.15794
Median :-0.10105 Median :-1.4524 Median :-0.03542 Median :1.1553 Median :0.49551
Mean : 0.09249 Mean :-1.4028 Mean : 0.15860 Mean :1.1117 Mean :0.51873
3rd Qu.: 0.51844 3rd Qu.:-0.7409 3rd Qu.: 0.50681 3rd Qu.:1.3142 3rd Qu.:0.81277
Max. : 1.34470 Max. :-0.2829 Max. : 1.39122 Max. :1.7882 Max. :1.14627
# Keep two columns from each table and bind them side by side.
fusion1 <- fusion1[c("yhat1", "yhat3")]
fusion2 <- fusion2[c("Rhamnos", "Arabinos")]
don <- cbind(fusion1, fusion2)
# Differences between the paired columns (yhat1 vs Rhamnos, yhat3 vs Arabinos).
yres1 <- don[["yhat1"]] - don[["Rhamnos"]]
yres2 <- don[["yhat3"]] - don[["Arabinos"]]
# Append the two difference vectors as extra columns.
don <- cbind(don, yres1, yres2)
names(don)
[1] "yhat1" "yhat3" "Rhamnos" "Arabinos" "yres1" "yres2"
# Show the first rows of the combined table, including the yres1 and yres2 columns.
head(don)
yhat1 yhat3 Rhamnos Arabinos yres1 yres2
1 -0.3701393 1.2683995 -0.3139029 1.391224687 -0.05623645 -0.12282522
2 -0.3759516 1.3191965 -0.4696197 1.279713657 0.09366813 0.03948288
3 -0.3276299 1.2206488 -0.4609091 1.172999031 0.13327920 0.04764980
4 -0.1751021 0.8299855 -0.3368723 0.550949775 0.16177021 0.27903576
5 -0.4216596 0.1009679 -0.3376569 0.009999835 -0.08400265 0.09096808
6 -0.4193860 1.2287143 -0.3876940 1.387992918 -0.03169209 -0.15927860
# Count each level of Xqual, then display the relative frequencies.
tabl <- table(Xqual)
prop.table(tabl)
Xqual
+ de 80 0-10.11-20.21-30 31-40 41-50.51-60.61-70 71-80
0.23529412 0.10588235 0.23529412 0.05882353 0.36470588
# Flag the levels whose relative frequency is below 5 %, and list them.
modalites <- levels(Xqual)
selecti <- prop.table(tabl) < 0.05
modalites[selecti]
character(0)
# Levels that are kept (relative frequency not below the threshold).
lesquels <- modalites[!selecti]
# Restrict the factor to the kept levels, then compute their relative
# frequencies; these serve as drawing probabilities afterwards.
prov <- factor(Xqual[Xqual %in% lesquels], levels = lesquels)
prov <- table(prov)
proba <- prop.table(prov)
proba
prov
+ de 80 0-10.11-20.21-30 31-40 41-50.51-60.61-70 71-80
0.23529412 0.10588235 0.23529412 0.05882353 0.36470588
# Reassign the observations of each flagged level to one of the kept levels,
# drawn at random with probabilities `proba`.
for (j in modalites[selecti]) {
  if (length(lesquels) == 1) {
    stop("1 seule modalite\n")
  }
  # random draws among the kept levels, with replacement
  Xqual[Xqual == j] <- sample(
    lesquels,
    size = sum(Xqual == j),
    replace = TRUE,
    prob = proba
  )
}
# Rebuild the factor from its labels so that levels no longer in use disappear.
Xqualvent <- factor(as.character(Xqual))
Xqualvent
[1] 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30
[8] 0-10.11-20.21-30 0-10.11-20.21-30 31-40 31-40 31-40 31-40 31-40
[15] 31-40 31-40 31-40 31-40 31-40 31-40 31-40
[22] 31-40 31-40 31-40 31-40 31-40 31-40 31-40
[29] 31-40 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 71-80
[36] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[43] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[50] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[57] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[64] 71-80 71-80 + de 80 + de 80 + de 80 + de 80 + de 80
[71] + de 80 + de 80 + de 80 + de 80 + de 80 + de 80 + de 80
[78] + de 80 + de 80 + de 80 + de 80 + de 80 + de 80 + de 80
[85] + de 80
Levels: + de 80 0-10.11-20.21-30 31-40 41-50.51-60.61-70 71-80
# Example factor: nine age classes, each repeated as many times as its count.
Xqual <- factor(rep(
  c("0-10", "11-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80",
    "+ de 80"),
  times = c(1, 3, 5, 20, 2, 2, 1, 31, 20)
))
# Counts per level, then relative frequencies.
tabl <- table(Xqual)
prop.table(tabl)
Xqual
+ de 80 0-10 11-20 21-30 31-40 41-50 51-60 61-70 71-80
0.23529412 0.01176471 0.03529412 0.05882353 0.23529412 0.02352941 0.02352941 0.01176471 0.36470588
# Frequency threshold below which a level is considered too rare.
p <- 0.05
# Flag the levels whose relative frequency is below the threshold, and list them.
selecti <- prop.table(tabl) < p
mod <- levels(Xqual)
mod[selecti]
[1] "0-10" "11-20" "41-50" "51-60" "61-70"
# Positions of the levels that are below the threshold p before any merging.
numero <- which(selecti)
# Merge rare levels with a neighbouring level until every level reaches the
# relative frequency p. The nlevels() test ends the loop once a single level
# remains, since there is then nothing left to merge with.
while (nlevels(Xqual) > 1 && any((tabl / sum(tabl)) < p)) {
  # first level whose relative frequency is too low
  j <- which((tabl / sum(tabl)) < p)[[1]]
  K <- length(mod) # current number of levels
  # merge with the following level, or with the preceding one for the last level
  neighbour <- if (j < K) j + 1 else j - 1
  pair <- sort(c(j, neighbour))
  merged_label <- paste(mod[pair[1]], mod[pair[2]], sep = ".")
  # Giving both positions the same label makes `levels<-` collapse them into
  # one level; all other labels stay in place. Writing into a copy of `mod`
  # avoids index arithmetic such as j+2:K, which R parses as j + (2:K).
  new_levels <- mod
  new_levels[pair] <- merged_label
  levels(Xqual) <- new_levels
  tabl <- table(Xqual) # refresh the counts
  mod <- levels(Xqual) # and the level labels
}
Xqual
[1] 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30 0-10.11-20.21-30
[8] 0-10.11-20.21-30 0-10.11-20.21-30 31-40 31-40 31-40 31-40 31-40
[15] 31-40 31-40 31-40 31-40 31-40 31-40 31-40
[22] 31-40 31-40 31-40 31-40 31-40 31-40 31-40
[29] 31-40 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 41-50.51-60.61-70 71-80
[36] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[43] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[50] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[57] 71-80 71-80 71-80 71-80 71-80 71-80 71-80
[64] 71-80 71-80 + de 80 + de 80 + de 80 + de 80 + de 80
[71] + de 80 + de 80 + de 80 + de 80 + de 80 + de 80 + de 80
[78] + de 80 + de 80 + de 80 + de 80 + de 80 + de 80 + de 80
[85] + de 80
Levels: + de 80 0-10.11-20.21-30 31-40 41-50.51-60.61-70 71-80
# 2 x 3 contingency table (rows: Faible/Forte, columns: Ang/Mer/Tex).
conting <- matrix(
  c(2, 1, 3, 0, 0, 4),
  nrow = 2,
  ncol = 3,
  dimnames = list(c("Faible", "Forte"), c("Ang", "Mer", "Tex"))
)
# Long form, one row per cell, as a character matrix:
# column label, row label, count (cells taken in column-major order).
tabmat <- cbind(
  rep(colnames(conting), each = nrow(conting)),
  rep(rownames(conting), times = ncol(conting)),
  as.vector(conting)
)
# Same content as a data frame, with the count column converted back to numeric.
tabframe <- data.frame(tabmat)
tabframe[[3]] <- as.numeric(as.character(tabframe[[3]]))
tabframe
X1 X2 X3
1 Ang Faible 2
2 Ang Forte 1
3 Mer Faible 3
4 Mer Forte 0
5 Tex Faible 0
6 Tex Forte 4
# Expand the frequency table into one row per individual.
# The counts are read from the last column of tabframe; every column before
# it holds a factor label.
col_eff <- ncol(tabframe)
n <- sum(tabframe[, col_eff]) # total number of individuals
nbefac <- col_eff - 1 # number of label columns
tabcomplet <- matrix("", n, nbefac)
iter <- 1 # next row of tabcomplet to fill
# seq_len() yields an empty loop for an empty table or a zero count,
# whereas 1:0 would iterate over c(1, 0).
for (i in seq_len(nrow(tabframe))) {
  for (j in seq_len(tabframe[i, col_eff])) {
    # The labels are taken from the character matrix tabmat: a matrix row is
    # a plain vector and can be assigned to a row of tabcomplet.
    tabcomplet[iter, ] <- tabmat[i, -col_eff]
    iter <- iter + 1
  }
}
data.frame(tabcomplet)
X1 X2
1 Ang Faible
2 Ang Faible
3 Ang Forte
4 Mer Faible
5 Mer Faible
6 Mer Faible
7 Tex Forte
8 Tex Forte
9 Tex Forte
10 Tex Forte
La matrice tabmat est utilisée dans l’affectation à la ligne de tabcomplet. L’affectation directe d’une ligne du data-frame n’est pas possible. En effet, un data-frame est une liste et une ligne d’un data-frame aussi et il est impossible d’affecter une liste dans une ligne de matrice, qui, elle, est un vecteur.