##------------------------------------------------------------------------##
## Script for Bio271: Introduction to R                                   ##
##    Maoying Wu                                                          ##
##    Fall 2010                                                           ##
##------------------------------------------------------------------------##


### 1. Download R from
##      http://cbb.sjtu.edu.cn/~mywu/bi217/R-2.11.1-win32.exe
### 2. Install R on your computer
### 3. Start R GUI Environment
# To change working directories:
#   on Windows, go to File menu, Change Directory
#   on UNIX (obvious), just call R when you're in the directory you want
#   or use the command setwd("/where/you/want/to/go")

### 4. Help topic
# To load the help page for the solve() function, type a question mark
# followed by the function name (help(solve) is the long form of the
# same request):
?solve
# To search for all help files that discuss solve, use
help.search("solve")
# At the bottom of the help page for solve, there are some examples.
# To execute these examples for the solve command, just use:
example(solve)


### 5. Assignments
# R has two types of elementary commands.
# 1. We can just compute a value and display it, for instance,
1/3
# 2. We can store the value of a computation in a variable, for instance,
z <- 1/3
# In case 2 above, the value of the computation won't be displayed.
# If we want to see what value was stored, we type the variable's name:
z
# The assignment arrow can point to the left or to the right.
# So we could type
x <- 1/2
# or we could type
1/2 -> x
# or we could call assign() with the variable name given as a string
assign("x", 1/2)
# All three forms store the same value, so displaying x gives 0.5 each time.
x

### 6. Variables
# To see a list of the currently defined variables we can type
objects()
# If we define one more variable (which was not previously defined)...
y <- 37
# ... it now shows up in our list of objects
objects()
# So objects() shows us the names of everything defined in the workspace,
# i.e., everything we're working with.
# The command ls() does the same thing as objects().
ls()
# The command rm() allows us to remove objects from the workspace, e.g.,
rm(y)


# 7. VECTORS
# c() concatenates its arguments into a vector;
#  here the result is stored in x.
x <- c(10.4, 5.6, 3.1, 6.4, 21.7)
# We can display the vector x, which has length 5.
x
# We can also concatenate vectors, as follows,
y <- c(x,0,x)
# so that y is a new vector of length 11.
# In fact, y contains the entries of x in positions 1 through 5,
#  then 0 in the sixth position,
#  and then the entries of x again in positions 7 through 11.
# Let's display y just to make sure.
y
# So at this point x is a vector of length 5
#  and y is a vector of length 11.
# Two vectors of different lengths are exactly what we need to try out
#  the recycling rule.


### 8. Additive Rule for vectors
# Let's display x and y and then form a linear combination of the vectors.
x
y
# We also add 1 to each entry of the result.
# It seems a little funny to add vectors of different lengths.
# Here, 3*x is a vector of length 5,
#  y is a vector of length 11,
#  and 1 is a vector of length 1.
# The "recycling rule" is used to perform the addition.
# Recycling means, whenever a vector isn't long enough, it is repeated
#  as many times as necessary.  So x is repeated 2.2 times,
#  and 1 is repeated 11 times.
# R gives a warning if the longest vector's length
#  is not an integer multiple of each shorter vector length.
#  11 is not a multiple of 5, so expect that warning here.
v <- 3*x + y + 1
# This gives us
v
# We can verify that
#  v[1] equals 3*x[1] + y[1] + 1
#  v[2] equals 3*x[2] + y[2] + 1
#  v[3] equals 3*x[3] + y[3] + 1
#  v[4] equals 3*x[4] + y[4] + 1
#  v[5] equals 3*x[5] + y[5] + 1
#  v[6] equals 3*x[1] + y[6] + 1
#  v[7] equals 3*x[2] + y[7] + 1
#  v[8] equals 3*x[3] + y[8] + 1
#  v[9] equals 3*x[4] + y[9] + 1
#  v[10] equals 3*x[5] + y[10] + 1
#  v[11] equals 3*x[1] + y[11] + 1
# The elements of x are repeated until x is as long as y: in effect 3*x is
# extended to c(3*x, 3*x, 3*x[1]) before it is added to y.


### 9. Concatenate two or more vectors, then add with recycling
# Let's do another example.  Let's make y a vector of length 10
#  (two copies of x, which has length 5), and z a vector of length 2.
y <- c(x,x)
y
z <- c(1,2)
z
# Then we'll add the two vectors.
y + z
# We see z is recycled: in effect it is appended to itself until it has been
# used 5 times in a row, and then it is added to y.  We get a vector of
# length 10.  Since 10 is a multiple of 2, there is no warning this time.
# We can verify that
#   The 1st component of y + z equals y[1] + z[1]
#   The 2nd component of y + z equals y[2] + z[2]
#   The 3rd component of y + z equals y[3] + z[1]
#   The 4th component of y + z equals y[4] + z[2]
#   The 5th component of y + z equals y[5] + z[1]
#   The 6th component of y + z equals y[6] + z[2]
#   The 7th component of y + z equals y[7] + z[1]
#   The 8th component of y + z equals y[8] + z[2]
#   The 9th component of y + z equals y[9] + z[1]
#   The 10th component of y + z equals y[10] + z[2]


### 10. Mathematical operations and functions
# Lots of the usual mathematical symbols make sense in R.  For instance,
# +, -, *, /, ^, %% (remainder), %/% (integer division)
y - z
37 * 1000
10/87
2^3
# Many common arithmetic functions are available too.
sin(3.14)
sin(3.1415)
pi
sin(pi)
# We don't get exactly 0, because we're doing the computation numerically
# (in floating point) here.
# The help pages for related functions are given on the same help page.
# For instance,
?sin
# also displays the help for cos, tan, acos, etc., so
?cos
# gives the same help page!
# Similarly,
?log
# and
?exp
# give the same help page.
# Other functions include abs, acos, acosh, asin, asinh, atan, atanh,
#  ceiling, cos, cosh, cummax, cummin, cumprod, cumsum, digamma,
#  exp, expm1 [see exp], floor, gamma, lgamma, log, log1p [see log], log10,
#  round, sign, signif, sin, sinh, sqrt, tan, tanh, trigamma, trunc


#### 12. Summary of the vectors
# For a vector x,
x
# we can compute functions of the whole vector too, for example
# The length() command is one of the more frequent commands we use
length(x)
max(x)
min(x)
# range() returns the smallest and the largest value together
range(x)
prod(x)
sum(x)
mean(x)
var(x)
sort(x)
# order() returns the permutation needed to put x in order, e.g.,
x <- c(1,1,3,2,1,1,2,3,4,3)
order(x)
# so that x[order(x)] is the sorted vector.
# sort.list(x) does the same thing
sort.list(x)
# order(x,y) puts x in order but uses y to break ties among the elements of x
# order(x,y,z) is similar.  Here x,y,z should be vectors of the same length.
# pmax and pmin return vectors, with each component being the largest or
#  smallest of the corresponding components of the given vectors.  For example
x <- c(1,2,3,5,7)
y <- c(37,22,3,9,2)
z <- c(6,-3,5,1,6)
# Then the ith component of pmax(x,y,z) is equal to the largest of the
#  ith components of x, y, and z
pmax(x,y,z)
# The following gives NaN and a warning
sqrt(-17)
# but the following gives the square root as a complex number
sqrt(-17+0i)


### 13. SEQUENCES
# The following is the same as c(1,2,3,4,5,...,30), but easier to type
1:30
# The ":" is evaluated first, giving c(1,2,3,...,15); the result is then
# multiplied by 2
2*1:15
# Notice the order of precedence here, and the result: ":" is evaluated
# before "-", so this is (1:10)-1, i.e. the numbers 0 through 9.
1:10-1
# backwards sequence
30:1
# We demonstrate some variations on the seq command:
seq(2,10)
# Note that the seq command has 5 arguments
# (from, to, by, length.out and along.with)
?seq
# Try the following
seq(1,41,by=2)
seq(1,41,length.out=5)
y <- 1:5
# so y has length 5
# Here we only write the name of the last argument, since it is out of order:
seq(1,41,along.with=y)
# We could just have written the names of all the arguments:
seq(from=1,to=41,along.with=y)
# We can even use a fractional step between the numbers.
seq(1,41,by=2.5)
# We can use any abbreviation which uniquely identifies a parameter from the
# start of its name (partial matching), for instance, len, length, etc.,
# can be used for length.out.  This is handy at the prompt, but spelling
# the name out in full is easier to read in scripts.
seq(1,41,len=5)
seq(1,41,length=5)
seq(1,41,length.out=5)
# I emphasize how useful it is to read the manual (help) pages:
?seq
seq(1,41,5)
seq(1,41,by=5)
seq(-5,5,by=.2)
seq(length=51,from=-5,by=.2)
# The manual has an ERROR.  It says "The fifth parameter may be
#   named along=vector, which if used must be the only parameter,
#   and creates a sequence 1, 2, ..., length(vector ), or the
#   empty sequence if the vector is empty (as it can be)."
# CORRECTION to the manual:  It is perfectly OK to use along=vector
# with some other parameters.  For instance, consider:
y <- 1:5
seq(1,41,along.with=y)
# this gives the same result as the following two commands:
seq(1,41,by=10)
seq(1,41,leng=5)


####  15. repeat
# OK, on to the rep() command.  First, let's read the manual for it.
?rep
x <- c(10,2,7,62,-5)
# The rep() command repeats an entire vector.
rep(x,times=5)
# We can repeat each element a different number of times, by assigning
#  a vector to the times parameter.  Then we have
# x[1] repeated times[1] times
# x[2] repeated times[2] times
# etc.  Let's try it.
y <- c(3,1,5,3,15)
# We should get 3 copies of x[1], 1 copy of x[2], 5 copies of x[3], etc.
rep(x,y)
# Serkan asked about what happens if the times vector is not long enough.
# Does it get recycled?  We experimented.
# It turns out that the times vector has to be exactly the same length
# as the vector we are operating on.  The y vector is not long enough here.
# (The call is wrapped in try() so that the error message is displayed but
# the rest of this script keeps running when the file is source()d.)
y <- c(2,6,7)
try(rep(x,y))
# If the times vector is too long, we also get an error.  All the counts
# are kept non-negative here so that the length is the only thing wrong:
y <- c(2,6,7,22,16,3,7)
try(rep(x,y))
# So the length of the times vector has to be just the same length
# as the length of the vector we are operating on.  We learn by trying!
# Here's another example that works correctly:
y <- c(2,6,7,22,16)
rep(x,y)
# The moral of story: the times vector must be the same length as x
# (or a single number, as in rep(x,times=5) above).
# Here's one more example, with the times vector defined inside the rep():
x <- 1:10
rep(x, times=c(2,2,2,2,1,1,1,3,3,3))
# We can also repeat each element of a vector
rep(x,each=5)
# This next example uses "recycling rule".  The vector x,
# which has length 10, is repeated 2.3 times, so that the result has length 23
rep(x,length.out=23)
# Now let's run the examples from the rep() command help file.
# (These examples are evaluated in the workspace and assign to x themselves,
# so x no longer holds 1:10 afterwards.)
example(rep)


### 16. LOGICAL VECTORS
x <- c(1,5,3,7,9,2,6,2)
# If we write a logical condition, like x > 4, then R tests every element
# of x and outputs a TRUE for each element that is bigger than 4.
# The output for the other elements of x will be FALSE.
x > 4
# We can store the resulting vector of Boolean values in a variable.
myboolvalues <- x > 4
myboolvalues
# Let's check to see that we know Boolean algebra.  First the "and":
TRUE & TRUE
TRUE & FALSE
# Then the "or":
TRUE | FALSE
FALSE | FALSE
# We also have a negation operator:
a <- TRUE
!a
# We can make Boolean expressions with variables
b <- FALSE
a & b
# HELPFUL HINT: In arithmetic, TRUE evaluates to 1 and FALSE evaluates to 0
# There are 4 TRUE's here, so we get a value of 4.
TRUE + TRUE + FALSE + TRUE + FALSE + TRUE
# Another possible value besides TRUE and FALSE is NA, "not available".
# Various functions are available to help us process NA's.
# For example, consider:
x <- c(1, 5, 3, 7, 9, 2, 6, 2, 6, NA, 37, -12, NA, 102)
is.na(x)
# We often use it this way, to flag the elements that are not NA's:
!is.na(x)
# Someone (Brian, I think) wanted to know if, in the expression is.na,
# is the period "." an operator?  In other words, does the period have
# some special meaning?
# Unfortunately, no.  The period is just part of the name of the function.
# The period isn't doing anything.  For instance, the period doesn't
# do something special to stuff afterward.... it's just part of the name.
# The following just gives an error, because no function with the name
# is.hello has been defined.  (It is wrapped in try() so that the error
# message is displayed but the rest of this script keeps running.)
try(is.hello(x))
# The "dot" is just part of the name.  Oh well....
# Let's recall the value of x
x
# Other operators are available, like > < <= >= == != ...
# We can compare the values in x to the values in another vector.
x
x == c(1,10,3,7,18,222,6,0,0,NA,3,-12,38,103)
# Wherever either side is NA, the answer is NA:
# R can't tell if NA's match, i.e., if they are equal.  R just doesn't know.
# If we try to compare to a shorter vector, R uses recycling.
x == c(1,5)
# This is especially important in the following example.  When the c(1,5,2)
# vector is recycled as c(1,5,2,1,5,2,1,5,2,1,5,2,1,5), the 6th element
# is "2", which will match the 6th value of x.  This might be unexpected.
# (The length of x, 14, is not a multiple of 3, so R also gives a warning.)
x
x == c(1,5,2)
# So recycling is happening in this example with the c(1,5,2)
# In class, I did not want to interrupt the flow of lecture, but
# someone asked about changing the values of Boolean variables to numbers.
# This can be done as follows:
as.numeric(TRUE)
as.numeric(FALSE)
# Some things are "not a number", denoted by NaN.  This is different from NA.
# Values could be classified as NaN because we can't tell if they are
# infinite, or finite, or an oscillating expression, etc.
# Here are two examples:
0/0
Inf - Inf
# I think that we just summarized freshman calculus.
# There are some built-in functions for working with NA and NaN.
# Note that is.na() is TRUE for both NA and NaN, while is.nan() is TRUE
# only for NaN.
is.na(NA)
is.na(NaN)
is.nan(NA)
is.nan(NaN)
# It is probably worthwhile to read the help pages
?NA
?NaN
# These help pages also describe the is.na() and is.nan() functions.


### 17. CHARACTER VECTORS
# We can store a sequence of characters into a variable, with either
# double quotes or single quotes, for instance:
x <- "hi there"
x
x <- 'hi there'
x
# We need backslashes for certain "escape sequences":
# \n for newline
# \t for tab
y <- "hi there\n\n\n mom"
# It is a little bit disappointing that R displays y on just one line.
# We might have expected R to display 3 newlines.  We will talk more
# about this later in the course, when we are doing string processing.
y
y <- "hi there \n \n \n mom"
y
# Make sure that the entire string is inside the quotes.
# Otherwise, there will be a syntax error.  For instance, typing
#   y <- "hi there"\n
# at the prompt is rejected.  A syntax error anywhere in a script stops R
# from reading the whole file, so here the faulty line is handed to parse()
# as text and wrapped in try(): the error is displayed and we carry on.
try(parse(text = 'y <- "hi there"\\n'))
# The help file is in ?Quotes
?Quotes
# One helpful command is paste
?paste
# The following will paste 1 after a, then 2 after b, then 3 after c,
# and then use recycling to paste 1 after d and 2 after e:
paste( c("a", "b", "c", "d", "e"), c(1,2,3))
# We can change the separator to HH instead of a single space
paste( c("a", "b", "c", "d", "e"), c(1,2,3), sep= "HH")
# or we can eliminate the space altogether.
paste( c("a", "b", "c", "d", "e"), c(1,2,3), sep= "")


# INDEX VECTORS
# Index vectors are really useful for accessing various parts of a vector
# There are four styles of index vectors.  The first is a logical vector.
x <- c(3,NA,7,0,5,1,8)
!is.na(x)
x > 0
(!is.na(x))&x>0
# This will grab all of the elements of x that are not NA's and that are > 0
x[(!is.na(x))&x>0]
# We could also do the same operation on the vector x + 1 instead:
# [In class, I forgot to add the "+1" throughout, but I fixed it here.
# J.T. gets 10 cents at the registrar for noticing my error.]
# Here is the correct version:
(x+1)[(!is.na(x+1))&(x+1)>0]
# The second style of index vectors is to use positive integral quantities:
# This grabs the 3rd component of x:
x[3]
# We could grab the 3rd through the 5th elements of x too:
x[3:5]
# or the first four elements of x
x[1:4]
# Many other variations are possible.  The indices don't have to be in order.
# We could grab the 3rd element 5 times in a row, then the 4th element,
# then the 1st element three times in a row, then the 6th element 3 times.
x[c(3,3,3,3,3,4,1,1,1,6,6,6)]
# The third style of index vectors is negative indices.  Negative indices
# are used to grab all of the vector EXCEPT for the negative indices.
# For instance, we could grab all of x except for the 2nd element
x[-2]
# Or grab all of x except for the 6th element
x[-6]
x
# So negative indices are not grabbed, and everything else is grabbed.
# A question:  What happens if we grab something out of bounds?
x[23]
# Answer:  We get NA, not an error.  No recycling is performed in such a case.
# The fourth style of index vectors is using a names attribute:
fruit <- c(5,10,1,20)
# names()<- attaches one label to each element, in order.
names(fruit) <- c("orange", "banana", "apple", "peach")
fruit
# We can refer to the elements of fruit by its associated "names" vector now
fruit[c("apple", "orange")]
# We can store the result in our lunch, so that we don't go hungry.
lunch <- fruit[c("apple", "orange")]
# What does our lunch contain?
lunch
# We can use an indexed expression on the left of an assignment too,
# so that it receives values.
# For instance, recall the values stored in x:
x
# We could replace all the NA's in the vector x by 0's
is.na(x)
x[is.na(x)]
x[is.na(x)] <- 0
# Here is the new vector x, which no longer contains any NA's.
x
# So each NA got replaced by 0.
# Now let's redefine vector x.
x <- c(3,NA,7,0,5,1,8)
# We could replace the NA's by something else (i.e., the 0 was not special)!
# Let's replace each NA by the value 502.
x[is.na(x)] <- 502
# Now let's give x some negative values too.
x[3] <- -9
x[5] <- -37
x
# We can check to see which values of x are negative
x < 0
x[x<0]
# and then replace those values with the negative of the values
# Since the negative of a negative is positive, this is equivalent to
# removing any negatives and just making them positive instead.
x[x<0] <- -x[x<0]
# So the -9 in x[3] gets replaced by 9
# and the -37 in x[5] gets replaced by 37
x

### cat
# cat() writes its arguments as plain text: no quotes, no [1] prefix,
# and no newline at the end unless we include "\n" ourselves.
cat("hello")
# print() displays a value the way R shows it at the prompt.
print("hello")
# stop() signals an error with the given message.  It is wrapped in try()
# here so that the message is displayed but the script keeps running.
try(stop("error: stop"))

#### EXERCISE
# Turn to page 14 of the book "usingR" and finish the exercises.

# Now we will type q() to quit R
q()