# generate object x (no output):
x <- 5
# display log(x)
log(x)
[1] 1.609438
Session 2
In R, everything is an object.
Objects have a name that is assigned with `<-` (recommended) or `=`.
Names have to start with a letter and include only letters, numbers, and characters such as “.” and “_”.
R is case sensitive: \(\Rightarrow \texttt{Name} \neq \texttt{name}\)!
Objects can store vectors, matrices, lists, data frames, functions…
# generate object x (no output):
x <- 5
# display log(x)
log(x)
[1] 1.609438
# object X is not defined => error message
X
Error: object 'X' not found
`vec <- c(value1, value2, value3)`.
`rep()`.
1+2:3^2 # '^2' evaluated before ':', only then '+1' is evaluated
[1] 3 4 5 6 7 8 9 10
1+2:3*4 # first ':', then '*4', then '+1'
[1] 9 13
# use brackets to avoid confusion or mistakes
(1+2):(3*4)
[1] 3 4 5 6 7 8 9 10 11 12
0 and variance 1. Store your results in the object `norm.vec`.
`norm.vec`.
Use `rep()` to repeat each element of `norm.vec` 3 times. Store the result in the object `norm.vec.rep`.
Is `mean(norm.vec.rep^2)` equal to `mean(norm.vec.rep)^2`?
`TRUE` or `FALSE`.
`if(condition is TRUE){do this}else{do that}`.
`==`, different by `!=` or compare them with `<` and `>`.
“AND” `&` and “OR” `|`
# define objects
obj1 <- 1
obj2 <- 2
obj3 <- 1 # same value as obj1
obj1 == obj2 # false statement
[1] FALSE
obj1 != obj2 # true statement
[1] TRUE
obj1 == obj2 & obj1 == obj3 # FALSE AND TRUE => FALSE
[1] FALSE
obj1 == obj2 | obj1 == obj3 # FALSE OR TRUE => TRUE
[1] TRUE
`&` and `|` are then applied element-wise.
vec2 <- 1:5 # defines vector vec2=(1,2,3,4,5)
vec2 == 3 # =FALSE if element is not 3, =TRUE if element is 3
[1] FALSE FALSE TRUE FALSE FALSE
vec2 >= 2 & vec2 < 5 # Only TRUE for elements >=2 and <5
[1] FALSE TRUE TRUE TRUE FALSE
vec2 >= 2 | vec2 < 5 # TRUE for all elements since either >=2 or <5
[1] TRUE TRUE TRUE TRUE TRUE
""
or `''`.
`"1"` to the numeric `1`.
`"1"` and `"2"`.
`as.numeric("1")` and `as.numeric("2")`. What happened?
`1` and the character `"2"`. Of which type are the elements of the vector?
`names()` command.
avg_temp <- c(Maastricht = 14.2, Amsterdam = 13.4, Rotterdam = 13.7)
print(avg_temp) # names appear on top of elements
Maastricht Amsterdam Rotterdam
14.2 13.4 13.7
names(avg_temp) # returns names of elements
[1] "Maastricht" "Amsterdam" "Rotterdam"
# Alternatively, we can define data and names separately
temp <- c(14.2, 13.4, 13.7)
names(temp) <- cities # recall that we have defined "cities" earlier!
print(temp)
Maastricht Amsterdam Rotterdam
14.2 13.4 13.7
# return the second element of "avg_temp" defined before
avg_temp[2]
Amsterdam
13.4
# return the element corresponding to "Maastricht"
avg_temp["Maastricht"]
Maastricht
14.2
# trying to access a non-existing element yields "NA"
# ( for "not available"), i.e., a missing value
avg_temp[4]
<NA>
NA
With `[-k]`, we can get the vector except for the \(k\)-th element.
# get the vector except for the third element
avg_temp[-3]
Maastricht Amsterdam
14.2 13.4
# now add another city to avg_temp
avg_temp["Tilburg"] <- 14.7
# now the fourth element is defined!
avg_temp[4]
Tilburg
14.7
`NA` (“not available”) indicates missing values.
A computation involving `NA` generally yields `NA`.
`NaN` (“not a number”) indicates the result of a mathematically undefined operation.
A matrix with `m` rows can be created directly using `matrix(vector,nrow=m)`.
# create matrix with 3 rows; fill numbers by row
mat1 <- matrix(1:12, nrow = 3, byrow = TRUE) # by default, R fills matrices by column
mat1
[,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
Vectors can be combined by row with `rbind(v1,v2,...)` or by column by `cbind(v1,v2,...)`.
# create vectors v1, v2 and v3 and combine them for same result
v1 <- 1:4
v2 <- 5:8
v3 <- 9:12
mat2 <- rbind(v1, v2, v3)
mat2
[,1] [,2] [,3] [,4]
v1 1 2 3 4
v2 5 6 7 8
v3 9 10 11 12
Row and column names can be set with `rownames()` and `colnames()`.
Elements are accessed by `[rownumber,colnumber]`, the `k`-th row by `[k,]` and the `k`-th column by `[,k]`.
# get element in second row in third column
mat1[2,3]
[1] 7
# get second row
mat1[2,]
col1 col2 col3 col4
5 6 7 8
# get third column
mat1[,3]
row1 row2 row3
3 7 11
3. Get the data for April and May by
   - including only the first and second row
   - excluding the third row
   - using the names
R can do “regular” matrix algebra, and even lets you do operations that are not well-defined mathematically.
`t(A)` is the transpose of the matrix `A`.
# define matrix containing normal data
data.vec <- rnorm(9, mean = 0, sd = 1)
A <- matrix(data.vec, nrow = 3)
A # return A
[,1] [,2] [,3]
[1,] 0.2716964 0.2826514 0.7622134
[2,] -1.5595010 -1.1265315 -0.2985537
[3,] -0.3099461 -1.3427173 -1.3471083
t(A) # return the transpose
[,1] [,2] [,3]
[1,] 0.2716964 -1.5595010 -0.3099461
[2,] 0.2826514 -1.1265315 -1.3427173
[3,] 0.7622134 -0.2985537 -1.3471083
`solve(A)` returns the inverse of an invertible matrix.
solve(A) # return the inverse of A
[,1] [,2] [,3]
[1,] 1.047873 -0.6030714 0.7265578
[2,] -1.884525 -0.1217632 -1.0393055
[3,] 1.637285 0.2601225 0.1264188
`*` does element-wise multiplication.
`%*%` does matrix multiplication.
# element-wise multiplication
A * solve(A) # NOT the identity matrix
[,1] [,2] [,3]
[1,] 0.2847033 -0.170459 0.5537921
[2,] 2.9389180 0.137170 0.3102885
[3,] -0.5074700 -0.349271 -0.1702998
A %*% solve(A) # matrix multiplication
[,1] [,2] [,3]
[1,] 1.000000e+00 -5.551115e-17 2.775558e-17
[2,] 0.000000e+00 1.000000e+00 -3.469447e-17
[3,] -4.440892e-16 -5.551115e-17 1.000000e+00
# yields the identity (up to a small error due to the
# numerical computation of the inverse)
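A `list` is a generic collection of objects.
Lists are created with `mylist <- list(name1=component1, name2=component2,...)`.
The names of the components are returned by `names(mylist)`.
Components are accessed with the `$` (dollar sign) operator, e.g., `mylist$name1`, or by position with `[[]]`.
mylist$city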
[1] "Maastricht"
mylist[[2]] # same result
[1] "Maastricht"
Data frames are simply data sets in R terminology.
Data files can contain multiple data sets.
Create a data frame with `data.frame()` or transform a matrix `mat` into a data frame by `as.data.frame(mat)`.
Many functions (e.g., `lm()` for regressions) need a data frame as input (see later sessions).
# generate a data frame
ID <- 1:4
hourly_wage <- rnorm(n = 4, mean = 20, sd = 1) # create 4 draws from N(20,1)
city <- c("Maastricht", "Eindhoven", "Amsterdam", NA)
dats <- data.frame(ID, hourly_wage, city) # add new variable
dats
ID hourly_wage city
1 1 20.18216 Maastricht
2 2 20.37298 Eindhoven
3 3 20.68820 Amsterdam
4 4 20.96485 <NA>
Variables in a data frame are accessed with the `$` operator.
New variables are added with the `$` operator.
`View()` opens a data-viewer. Very useful (but difficult to demonstrate on these slides).
dats$city # "city" is NA for ID 4.
[1] "Maastricht" "Eindhoven" "Amsterdam" NA
dats$city[4] <- 'Tilburg' # assign city to ID 4
dats$educ <- c(12, 21, 9, 10)
dats
ID hourly_wage city educ
1 1 20.18216 Maastricht 12
2 2 20.37298 Eindhoven 21
3 3 20.68820 Amsterdam 9
4 4 20.96485 Tilburg 10
With `subset(data_frame,condition)`, we can easily get a subset of the original data frame where `condition` is `TRUE`.
# only keep individuals with more than 10 years of education
sub_dats <- subset(dats, educ > 10)
sub_dats
ID hourly_wage city educ
1 1 20.18216 Maastricht 12
2 2 20.37298 Eindhoven 21
Create a vector `ID` that contains the sequence 1,2,…,100.
Create a vector `income` that contains 100 random draws from N(10,1).
Create a vector `female` that is 1 for `ID=1,...,50` and 0 otherwise. (hint: you can achieve this by using `rep()` twice and combining two vectors with `c()`)
Combine them into a data frame `my_df`.
Open it with `View(my_df)`.
Create a data frame `sub_my_df` that contains only individuals with income larger than 10.
n <- 100 # set the sample size
X <- rnorm(n, mean = 1, sd = 2)# define the observed covariate X
epsilon <- rnorm(n, mean = 0, sd = 1) # define the model error
beta0 <- 1 # define true intercept
beta1 <- 2 # define true slope
Y <- beta0 + beta1 * X + epsilon # generate Y according to a linear model
# recall the formula in a bivariate model
beta1.hat <- cov(X,Y) / var(X)
beta0.hat <- mean(Y) - beta1.hat * mean(X)
# print estimators
beta0.hat
[1] 0.9852093
beta1.hat
[1] 1.974165
`X`. What is the effect on `beta1.hat`?
`epsilon`. What is the effect on `beta0.hat`?
`X` and `epsilon`. What is the effect?