R codes and Programming: February 2018

Cross tabs in R using {gmodels}

*using retinol data from below link

{gmodels}
CrossTable (retinol$agecat1, retinol$tabac, chisq=TRUE)

| retinol$tabac
retinol$agecat1 | Never | Ex | Current | Row Total |
----------------|-----------|-----------|-----------|-----------|
(0,20] | 1 | 0 | 0 | 1 |
| 0.505 | 0.365 | 0.137 | |
| 1.000 | 0.000 | 0.000 | 0.003 |
| 0.006 | 0.000 | 0.000 | |
| 0.003 | 0.000 | 0.000 | |
----------------|-----------|-----------|-----------|-----------|
(20,30] | 9 | 3 | 5 | 17 |
| 0.033 | 1.656 | 3.094 | |
| 0.529 | 0.176 | 0.294 | 0.054 |
| 0.057 | 0.026 | 0.116 | |
| 0.029 | 0.010 | 0.016 | |
----------------|-----------|-----------|-----------|-----------|
(30,40] | 31 | 29 | 13 | 73 |
| 0.797 | 0.207 | 0.924 | |
| 0.425 | 0.397 | 0.178 | 0.232 |
| 0.197 | 0.252 | 0.302 | |
| 0.098 | 0.092 | 0.041 | |
----------------|-----------|-----------|-----------|-----------|
(40,50] | 46 | 32 | 14 | 92 |
| 0.000 | 0.075 | 0.165 | |
| 0.500 | 0.348 | 0.152 | 0.292 |
| 0.293 | 0.278 | 0.326 | |
| 0.146 | 0.102 | 0.044 | |
----------------|-----------|-----------|-----------|-----------|
(50,60] | 21 | 20 | 5 | 46 |
| 0.162 | 0.612 | 0.261 | |
| 0.457 | 0.435 | 0.109 | 0.146 |
| 0.134 | 0.174 | 0.116 | |
| 0.067 | 0.063 | 0.016 | |
----------------|-----------|-----------|-----------|-----------|
(60,90] | 49 | 31 | 6 | 86 |
| 0.879 | 0.005 | 2.806 | |
| 0.570 | 0.360 | 0.070 | 0.273 |
| 0.312 | 0.270 | 0.140 | |
| 0.156 | 0.098 | 0.019 | |
----------------|-----------|-----------|-----------|-----------|
Column Total | 157 | 115 | 43 | 315 |
| 0.498 | 0.365 | 0.137 | |
----------------|-----------|-----------|-----------|-----------|

Statistics for All Table Factors

Pearson's Chi-squared test
------------------------------------------------------------

Chi^2 = 12.68272 d.f. = 10 p = 0.2419562

Categorizing a variable in R using the cut command

retinol$age
[1] 64 76 38 40 72 40 65 58 35 55 66 40 57 66 66 64 62 75 68 57 56 30 34 53 60 50 62 61 65 71 43 33 74 41
[35] 56 44 37 37 39 37 53 66 58 31 49 75 62 56 69 50 50 72 60 55 43 29 44 48 38 33 56 65 48 66 78 65 72 31
[69] 74 61 83 46 62 56 33 45 41 73 31 50 65 35 42 19 37 44 36 53 69 74 70 56 77 73 43 41 22 44 51 46 25 69
[103] 69 58 77 67 37 47 60 49 83 56 39 41 37 66 55 49 31 64 57 42 40 73 37 43 66 26 45 74 49 41 74 38 83 70
[137] 39 65 72 46 46 51 39 33 49 44 69 52 82 46 25 54 70 35 46 70 71 59 42 46 37 39 43 23 45 63 44 43 33 35
[171] 50 50 59 38 60 54 71 48 49 27 27 75 45 35 56 41 48 48 32 38 52 40 44 70 55 48 38 49 36 64 49 49 43 33
[205] 38 36 33 47 53 41 36 32 57 42 37 52 45 42 42 41 41 26 70 44 46 22 46 32 27 52 42 34 36 59 75 43 56 48
[239] 34 37 41 74 73 53 74 77 39 29 37 47 64 36 42 64 38 39 40 29 71 45 63 46 75 46 44 24 32 66 43 73 54 44
[273] 55 70 32 33 49 69 38 34 49 40 41 62 28 78 67 51 29 36 43 55 54 32 65 41 60 73 71 47 35 34 41 33 73 67
[307] 66 44 36 48 46 45 49 31 45

retinol$agecat1<-cut(retinol$age, c(0,20,30,40,50,60,90), right=T, labels=c("0 to 20", "21 to 30", "31 to 40", "41 to 50", "51 to 60", "61 to 90"))

[1] (60,90] (60,90] (30,40] (30,40] (60,90] (30,40] (60,90] (50,60] (30,40] (50,60] (60,90] (30,40]
[13] (50,60] (60,90] (60,90] (60,90] (60,90] (60,90] (60,90] (50,60] (50,60] (20,30] (30,40] (50,60]
[25] (50,60] (40,50] (60,90] (60,90] (60,90] (60,90] (40,50] (30,40] (60,90] (40,50] (50,60] (40,50]
[37] (30,40] (30,40] (30,40] (30,40] (50,60] (60,90] (50,60] (30,40] (40,50] (60,90] (60,90] (50,60]
[49] (60,90] (40,50] (40,50] (60,90] (50,60] (50,60] (40,50] (20,30] (40,50] (40,50] (30,40] (30,40]
[61] (50,60] (60,90] (40,50] (60,90] (60,90] (60,90] (60,90] (30,40] (60,90] (60,90] (60,90] (40,50]
[73] (60,90] (50,60] (30,40] (40,50] (40,50] (60,90] (30,40] (40,50] (60,90] (30,40] (40,50] (0,20]
[85] (30,40] (40,50] (30,40] (50,60] (60,90] (60,90] (60,90] (50,60] (60,90] (60,90] (40,50] (40,50]
[97] (20,30] (40,50] (50,60] (40,50] (20,30] (60,90] (60,90] (50,60] (60,90] (60,90] (30,40] (40,50]
[109] (50,60] (40,50] (60,90] (50,60] (30,40] (40,50] (30,40] (60,90] (50,60] (40,50] (30,40] (60,90]
[121] (50,60] (40,50] (30,40] (60,90] (30,40] (40,50] (60,90] (20,30] (40,50] (60,90] (40,50] (40,50]
[133] (60,90] (30,40] (60,90] (60,90] (30,40] (60,90] (60,90] (40,50] (40,50] (50,60] (30,40] (30,40]
[145] (40,50] (40,50] (60,90] (50,60] (60,90] (40,50] (20,30] (50,60] (60,90] (30,40] (40,50] (60,90]
[157] (60,90] (50,60] (40,50] (40,50] (30,40] (30,40] (40,50] (20,30] (40,50] (60,90] (40,50] (40,50]
[169] (30,40] (30,40] (40,50] (40,50] (50,60] (30,40] (50,60] (50,60] (60,90] (40,50] (40,50] (20,30]
[181] (20,30] (60,90] (40,50] (30,40] (50,60] (40,50] (40,50] (40,50] (30,40] (30,40] (50,60] (30,40]
[193] (40,50] (60,90] (50,60] (40,50] (30,40] (40,50] (30,40] (60,90] (40,50] (40,50] (40,50] (30,40]
[205] (30,40] (30,40] (30,40] (40,50] (50,60] (40,50] (30,40] (30,40] (50,60] (40,50] (30,40] (50,60]
[217] (40,50] (40,50] (40,50] (40,50] (40,50] (20,30] (60,90] (40,50] (40,50] (20,30] (40,50] (30,40]
[229] (20,30] (50,60] (40,50] (30,40] (30,40] (50,60] (60,90] (40,50] (50,60] (40,50] (30,40] (30,40]
[241] (40,50] (60,90] (60,90] (50,60] (60,90] (60,90] (30,40] (20,30] (30,40] (40,50] (60,90] (30,40]
[253] (40,50] (60,90] (30,40] (30,40] (30,40] (20,30] (60,90] (40,50] (60,90] (40,50] (60,90] (40,50]
[265] (40,50] (20,30] (30,40] (60,90] (40,50] (60,90] (50,60] (40,50] (50,60] (60,90] (30,40] (30,40]
[277] (40,50] (60,90] (30,40] (30,40] (40,50] (30,40] (40,50] (60,90] (20,30] (60,90] (60,90] (50,60]
[289] (20,30] (30,40] (40,50] (50,60] (50,60] (30,40] (60,90] (40,50] (50,60] (60,90] (60,90] (40,50]
[301] (30,40] (30,40] (40,50] (30,40] (60,90] (60,90] (60,90] (40,50] (30,40] (40,50] (40,50] (40,50]

[313] (40,50] (30,40] (40,50]

describe (retinol$agecat1)

Factor

x (40,50] (60,90] (30,40] (50,60] (20,30] (0,20]

Count 92.00 86.0 73.00 46.0 17.0 1.00

Percent 29.21 27.3 23.17 14.6 5.4 0.32

Mode (40,50]

Density plots and histograms

Using retinol data
{ggplot2}

qplot (cholesterol, data=retinol, geom="histogram", binwidth=20)

qplot (cholesterol, data=retinol, geom="density")

Splitting density plots and histograms by different categories of a selected variable
qplot (cholesterol, data=retinol, geom="density", colour=tabac)

qplot (cholesterol, data=retinol, geom="histogram", binwidth=20, fill=tabac)

Regression model with variables with different lengths

Sometimes when you are running a regression model with variables that have different lengths, r will prompt with the following error message:

Error in model.frame.default(formula = bmi ~ age, drop.unused.levels = TRUE) :
variable lengths differ (found for 'age')

This error message is mainly due to the fact that either bmi or age has different number of NA observations. The first step is to remove the NAs from the data frame at hand

Remove NAs

df2=df1[complete.cases(df1),]

Box plots using {ggplot2} {ggpubr} and {car}

Download retinol data

> retinol=read.csv2("C:/Users/ke05/Documents/My Dropbox/Biostat course/EPHD 312/Spring 2017/Project 1/TPretinol.csv")
> retinol$tabac<-factor(tabac, levels=c(1,2,3), labels=c("Never", "Ex", "Current"))
# Plot weight by group and color by group
library("ggpubr")
> ggboxplot(retinol, x = "tabac", y = "age", color = "tabac", palette = c("#00AFBB", "#E7B800", "#FC4E07"), order = c("Never", "Ex", "Current"), ylab = "age", xlab = "Smoking Status")

#using the library {car}

> boxplot (retdiet~tabac, xlab="smoking status", ylab="concentration", data=retinol)

#How to add a row to an existing dataframe

sbp=c(144, 138, 162, 170, 158)

age=c(39, 45, 65, 67, 67)

#start by converting the listings into a data frame

df=data.frame(sbp,age)

sbp age

1 144 39

2 138 45

3 162 65

4 170 67

5 158 67

df1=rbind(df, c(200,800))

df1

sbp age

1 144 39

2 138 45

3 162 65

4 170 67

5 158 67

6 200 800

Working with factors and numbers

#working with factors

>data=c(1,2,3,2,2,3,1,1,3,4,4,3)

>fdata=factor(data)

>fdata

>table (fdata)

#assigning label to a factor variable

>rdata=factor(data,labels=c('I','II','III','IV'))

>rdata

>table (rdata)

#Example using retinol data>retinol$tabac=factor(retinol$tabac, labels=c("Never", "Ex", "Current"))

#transforming an object into factor is necessary so that R treats it as a categorical variable

varname=as.factor(varname)

#transforming string to numeric

names=c("donald", "mickey", "pluto")

names_fact=as.factor(names)

names_fact

names_num=as.numeric(names_fact)

names_num

Basic R

#Creating objects in R

a=c(1,2,3,4)

b=c(4,5,6,1)

c=a+b

d=seq(1,20,2)

x=c(eleven=11, twelve=12, thirteen=13, fourteen=14)

#changing the name of elements of the vector

names (x)[1:2]=c("onze", "douze")

mode (x)

class (x)

z=c(1,2,3)

#combining x and z into 1 vector

all=c(y,x,z)

all

#using list we can create objects of varying dimensions and types in a single R object

mylist<-list(c(1,2,3), "dog", TRUE, c(9,8,7))

mylist

sapply(mylist,mode)

mylist[[2]]

mylist[[1]]

#you can try it on any other object

x[2]

x[1]

#working with matrices

matrix=matrix(c(1,2,3, 11,12,13),nrow=3, ncol=2, byrow=T, dimnames=list(c("r1", "r2", "r3"), c("c1", "c2")))

matrix

names_matrix=list(c("row1", "row2", "row3"), c("col1", "col2"))

matrix1=matrix(c(1,2,3, 11,12,13),nrow=3, ncol=2, byrow=T, dimnames=names_matrix)

sbp=c(144, 138, 162, 170, 158, 162, 140, 128, 135, 116, 136, 120, 160, 144, 125, 220, 145, 142, 124, 154, 150, 110, 130, 114, 124, 142, 120, 158, 130, 175)

age=c(39, 45, 65, 67, 67, 64, 59, 42, 45, 20, 36, 39, 44, 63, 25, 47, 47, 46, 42, 56, 56, 34, 48, 17, 19, 50, 21, 33, 29, 69)

matrix_age=matrix(age, nrow=5, ncol=6, byrow=T)

matrix_sbp=matrix(sbp, nrow=5, ncol=6, byrow=T)

matrix_age

matrix_sbp

#creating an identity matrix of size 5x5

n=c(5)

I=matrix(0,nrow=n, ncol=n)

I[row(I)==col(I)]=1

Intro

#reading R data

mydata=read.csv("filepath.csv")
example:
retinol=read.csv2("C:/Users/ke05/Documents/My Dropbox/Biostat course/EPHD 312/Spring 2017/Project 1/TPretinol.csv")

Stata 13 dataset

library("readstata13")

dataset_name<-read.dta13(“~/filepath/…/filename.dta”, convert.factors = TRUE, generate.factors = FALSE, encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors = TRUE)

dataset_name<-read.dta13(“~/filepath/…/filename.dta”)

*note: you can use either read.csv or read.csv2

R codes and Programming