# generate object x (no output):
x <- 5
# display log(x)
log(x)
[1] 1.609438
Session 2
In R, everything is an object.
Objects have a name that is assigned with <- (recommended) or =.
Names have to start with a letter and include only letters, numbers, and characters such as “.” and “_”.
R is case sensitive \(\Rightarrow\) \(\texttt{Name} \neq \texttt{name}\)!
Objects can store vectors, matrices, lists, data frames, functions…
# generate object x (no output):
x <- 5
# display log(x)
log(x)
[1] 1.609438
# object X is not defined => error message
X
Error: object 'X' not found
Vectors are created by combining values with vec <- c(value1, value2, value3).
Elements can be repeated with rep().
1+2:3^2 # '^2' evaluated before ':', only then '+1' is evaluated
[1] 3 4 5 6 7 8 9 10
1+2:3*4 # first ':', then '*4', then '+1'
[1] 9 13
# use brackets to avoid confusion or mistakes
(1+2):(3*4)
[1] 3 4 5 6 7 8 9 10 11 12
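Exercise:
- Generate a vector of random draws from a normal distribution with mean 0 and variance 1. Store your results in the object norm.vec.
- Use rep() to repeat each element of norm.vec 3 times. Store the result in the object norm.vec.rep.
- Is mean(norm.vec.rep^2) equal to mean(norm.vec.rep)^2?
Logical values are either TRUE or FALSE.
They are used in conditional statements of the form if(condition is TRUE){do this}else{do that}.
We can check whether two objects are equal by ==, different by != or compare them with < and >.
Logical statements can be combined with “AND” & and “OR” |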
# define objects
obj1 <- 1
obj2 <- 2
obj3 <- 1 # same value as obj1
obj1 == obj2 # false statement[1] FALSE
obj1 != obj2 # true statement[1] TRUE
obj1 == obj2 & obj1 == obj3 # FALSE AND TRUE => FALSE[1] FALSE
obj1 == obj2 | obj1 == obj3 # FALSE OR TRUE => TRUE[1] TRUE
Comparisons can also be applied to vectors; the comparison operators as well as & and | are then applied element-wise.
vec2 <- 1:5 # defines vector vec2=(1,2,3,4,5)
vec2 == 3 # =FALSE if element is not 3, =TRUE if element is 3[1] FALSE FALSE TRUE FALSE FALSE
vec2 >= 2 & vec2 < 5 # Only TRUE for elements >=2 and <5[1] FALSE TRUE TRUE TRUE FALSE
vec2 >= 2 | vec2 < 5 # TRUE for all elements since either >=2 or <5[1] TRUE TRUE TRUE TRUE TRUE
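Character strings are enclosed in "" or ''.
Note the difference between the character "1" and the numeric 1.
Exercise:
- Try to add the characters "1" and "2".
- Now add as.numeric("1") and as.numeric("2"). What happened?
- Create a vector containing the numeric 1 and the character "2". Of which type are the elements of the vector?
The elements of a vector can be given names, which are returned by the names() command.
avg_temp <- c(Maastricht = 14.2, Amsterdam = 13.4, Rotterdam = 13.7)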
print(avg_temp) # names appear on top of elementsMaastricht Amsterdam Rotterdam
14.2 13.4 13.7
names(avg_temp) # returns names of elements[1] "Maastricht" "Amsterdam" "Rotterdam"
# Alternatively, we can define data and names separately
temp <- c(14.2, 13.4, 13.7)
names(temp) <- cities # recall that we have defined "cities" earlier!
print(temp)Maastricht Amsterdam Rotterdam
14.2 13.4 13.7
# return the second element of "avg_temp" defined before
avg_temp[2] Amsterdam
13.4
# return the element corresponding to "Maastricht"
avg_temp["Maastricht"]Maastricht
14.2
# trying to access a non-existing element yields "NA"
# ( for "not available"), i.e., a missing value
avg_temp[4]
<NA>
NA
With [-k], we can get the vector except for the \(k\)-th element.
# get the vector except for the third element
avg_temp[-3]Maastricht Amsterdam
14.2 13.4
# now add another city to avg_temp
avg_temp["Tilburg"] <- 14.7
# now the fourth element is defined!
avg_temp[4]Tilburg
14.7
NA (“not available”) indicates missing values.
Computing with NA generally yields NA.
NaN (“not a number”) indicates the result of a mathematically undefined operation.
A matrix can be created from a vector with m rows directly using matrix(vector,nrow=m).
# create matrix with 3 rows; fill numbers by row
mat1 <- matrix(1:12, nrow = 3, byrow = TRUE) # by default, R fills matrices by column
mat1 [,1] [,2] [,3] [,4]
[1,] 1 2 3 4
[2,] 5 6 7 8
[3,] 9 10 11 12
Alternatively, vectors can be combined into a matrix by row by rbind(v1,v2,...) or by column by cbind(v1,v2,...).
# create vectors v1, v2 and v3 and combine them for same result
v1 <- 1:4
v2 <- 5:8
v3 <- 9:12
mat2 <- rbind(v1, v2, v3)
mat2 [,1] [,2] [,3] [,4]
v1 1 2 3 4
v2 5 6 7 8
v3 9 10 11 12
Rows and columns can be named with rownames() and colnames().
We access a single element by [rownumber,colnumber], the k-th row by [k,] and the k-th column by [,k].
# get element in second row in third column
mat1[2,3][1] 7
# get second row
mat1[2,]col1 col2 col3 col4
5 6 7 8
# get third column
mat1[,3]row1 row2 row3
3 7 11
3. Get the data for April and May by - including only the first and second row - excluding the third row - using the names
R can do “regular” matrix algebra, and even lets you do operations that are not well-defined mathematically.
t(A) is the transpose of the matrix A.
# define matrix containing normal data
data.vec <- rnorm(9, mean = 0, sd = 1)
A <- matrix(data.vec, nrow = 3)
A # return A [,1] [,2] [,3]
[1,] -0.7569037 -0.02340064 1.124970
[2,] -0.6589236 0.22297959 1.963878
[3,] 0.6227039 -1.31762563 -1.514380
t(A) # return the transpose [,1] [,2] [,3]
[1,] -0.75690372 -0.6589236 0.6227039
[2,] -0.02340064 0.2229796 -1.3176256
[3,] 1.12496977 1.9638780 -1.5143799
solve(A) returns the inverse of an invertible matrix.
solve(A) # return the inverse of A
 [,1] [,2] [,3]
[1,] -2.5344123 1.7095904 0.3343216
[2,] -0.2535042 -0.5020622 -0.8394019
[3,] -0.8215672 1.1398055 0.2074782
* does element-wise multiplication.
%*% does matrix multiplication.
# element-wise multiplication
A * solve(A) # NOT the identity matrix [,1] [,2] [,3]
[1,] 1.9183061 -0.04000552 0.3761016
[2,] 0.1670399 -0.11194962 -1.6484830
[3,] -0.5115931 -1.50183698 -0.3142008
# matrix multiplication
A %*% solve(A)
[,1] [,2] [,3]
[1,] 1 0.000000e+00 0
[2,] 0 1.000000e+00 0
[3,] 0 -2.220446e-16 1
# yields the identity (up to a small error due to the
# numerical computation of the inverse)
A list is a generic collection of objects.
Lists are created with mylist <- list(name1=component1, name2=component2,...).
The names of the components are returned by names(mylist).
Components are accessed with the $ (dollar sign) operator, e.g., mylist$name1, or by position with [[]].
mylist$city
[1] "Maastricht"
mylist[[2]] # same result[1] "Maastricht"
Data frames are simply data sets in R terminology.
R data files can contain multiple data sets.
We can create a data frame with data.frame() or transform a matrix mat into a data frame by as.data.frame(mat).
Many commands (e.g., lm() for regressions) need a data frame as input (see later sessions).
# generate a data frame
ID <- 1:4
hourly_wage <- rnorm(n = 4, mean = 20, sd = 1) # create 4 draws from N(20,1)
city <- c("Maastricht", "Eindhoven", "Amsterdam", NA)
dats <- data.frame(ID, hourly_wage, city) # combine the variables into a data frame
dats ID hourly_wage city
1 1 20.60223 Maastricht
2 2 19.20833 Eindhoven
3 3 19.56172 Amsterdam
4 4 19.21373 <NA>
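Variables of a data frame can be accessed with the $ operator.
Variables can be changed or newly added with the $ operator.
View() opens a data-viewer. Very useful (but difficult to demonstrate on these slides).
dats$city # "city" is NA for ID 4.
[1] "Maastricht" "Eindhoven" "Amsterdam" NA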
dats$city[4] <- 'Tilburg' # assign city to ID 4
dats$educ <- c(12, 21, 9, 10) # add new variable
dats ID hourly_wage city educ
1 1 20.60223 Maastricht 12
2 2 19.20833 Eindhoven 21
3 3 19.56172 Amsterdam 9
4 4 19.21373 Tilburg 10
With subset(data_frame,condition), we can easily get a subset of the original data frame where condition is TRUE.
# only keep individuals with more than 10 years of education
sub_dats <- subset(dats, educ > 10)
sub_dats ID hourly_wage city educ
1 1 20.60223 Maastricht 12
2 2 19.20833 Eindhoven 21
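Exercise:
1. Create a variable ID that contains the sequence 1,2,…,100.
2. Create a variable income that contains 100 random draws from N(10,1).
3. Create a variable female that is 1 for ID=1,...,50 and 0 otherwise. (hint: you can achieve this by using rep() twice and combining two vectors with c())
4. Combine the three variables into a data frame called my_df.
5. Inspect the data frame with View(my_df).
6. Create a data frame sub_my_df that contains only individuals with income larger than 10.
n <- 100 # set the sample size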
X <- rnorm(n, mean = 1, sd = 2)# define the observed covariate X
epsilon <- rnorm(n, mean = 0, sd = 1) # define the model error
beta0 <- 1 # define true intercept
beta1 <- 2 # define true slope
Y <- beta0 + beta1 * X + epsilon # generate Y according to a linear model
# recall the formula in a bivariate model
beta1.hat <- cov(X,Y) / var(X)
beta0.hat <- mean(Y) - beta1.hat * mean(X)
# print estimators
beta0.hat[1] 1.166459
beta1.hat[1] 1.924542
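Exercise: re-run the simulation above after changing
- how X is generated (e.g., its mean or standard deviation). What is the effect on beta1.hat?
- how epsilon is generated (e.g., its mean or standard deviation). What is the effect on beta0.hat?
- both X and epsilon. What is the effect?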