Cross tabs in R using {gmodels}

*using retinol data from below link

{gmodels}
CrossTable (retinol$agecat1, retinol$tabac, chisq=TRUE)

                | retinol$tabac 
retinol$agecat1 |     Never |        Ex |   Current | Row Total | 
----------------|-----------|-----------|-----------|-----------|
         (0,20] |         1 |         0 |         0 |         1 | 
                |     0.505 |     0.365 |     0.137 |           | 
                |     1.000 |     0.000 |     0.000 |     0.003 | 
                |     0.006 |     0.000 |     0.000 |           | 
                |     0.003 |     0.000 |     0.000 |           | 
----------------|-----------|-----------|-----------|-----------|
        (20,30] |         9 |         3 |         5 |        17 | 
                |     0.033 |     1.656 |     3.094 |           | 
                |     0.529 |     0.176 |     0.294 |     0.054 | 
                |     0.057 |     0.026 |     0.116 |           | 
                |     0.029 |     0.010 |     0.016 |           | 
----------------|-----------|-----------|-----------|-----------|
        (30,40] |        31 |        29 |        13 |        73 | 
                |     0.797 |     0.207 |     0.924 |           | 
                |     0.425 |     0.397 |     0.178 |     0.232 | 
                |     0.197 |     0.252 |     0.302 |           | 
                |     0.098 |     0.092 |     0.041 |           | 
----------------|-----------|-----------|-----------|-----------|
        (40,50] |        46 |        32 |        14 |        92 | 
                |     0.000 |     0.075 |     0.165 |           | 
                |     0.500 |     0.348 |     0.152 |     0.292 | 
                |     0.293 |     0.278 |     0.326 |           | 
                |     0.146 |     0.102 |     0.044 |           | 
----------------|-----------|-----------|-----------|-----------|
        (50,60] |        21 |        20 |         5 |        46 | 
                |     0.162 |     0.612 |     0.261 |           | 
                |     0.457 |     0.435 |     0.109 |     0.146 | 
                |     0.134 |     0.174 |     0.116 |           | 
                |     0.067 |     0.063 |     0.016 |           | 
----------------|-----------|-----------|-----------|-----------|
        (60,90] |        49 |        31 |         6 |        86 | 
                |     0.879 |     0.005 |     2.806 |           | 
                |     0.570 |     0.360 |     0.070 |     0.273 | 
                |     0.312 |     0.270 |     0.140 |           | 
                |     0.156 |     0.098 |     0.019 |           | 
----------------|-----------|-----------|-----------|-----------|
   Column Total |       157 |       115 |        43 |       315 | 
                |     0.498 |     0.365 |     0.137 |           | 
----------------|-----------|-----------|-----------|-----------|


Statistics for All Table Factors


Pearson's Chi-squared test 
------------------------------------------------------------

Chi^2 =  12.68272     d.f. =  10     p =  0.2419562 

Categorizing a variable in R using the cut command



retinol$age
  [1] 64 76 38 40 72 40 65 58 35 55 66 40 57 66 66 64 62 75 68 57 56 30 34 53 60 50 62 61 65 71 43 33 74 41
 [35] 56 44 37 37 39 37 53 66 58 31 49 75 62 56 69 50 50 72 60 55 43 29 44 48 38 33 56 65 48 66 78 65 72 31
 [69] 74 61 83 46 62 56 33 45 41 73 31 50 65 35 42 19 37 44 36 53 69 74 70 56 77 73 43 41 22 44 51 46 25 69
[103] 69 58 77 67 37 47 60 49 83 56 39 41 37 66 55 49 31 64 57 42 40 73 37 43 66 26 45 74 49 41 74 38 83 70
[137] 39 65 72 46 46 51 39 33 49 44 69 52 82 46 25 54 70 35 46 70 71 59 42 46 37 39 43 23 45 63 44 43 33 35
[171] 50 50 59 38 60 54 71 48 49 27 27 75 45 35 56 41 48 48 32 38 52 40 44 70 55 48 38 49 36 64 49 49 43 33
[205] 38 36 33 47 53 41 36 32 57 42 37 52 45 42 42 41 41 26 70 44 46 22 46 32 27 52 42 34 36 59 75 43 56 48
[239] 34 37 41 74 73 53 74 77 39 29 37 47 64 36 42 64 38 39 40 29 71 45 63 46 75 46 44 24 32 66 43 73 54 44
[273] 55 70 32 33 49 69 38 34 49 40 41 62 28 78 67 51 29 36 43 55 54 32 65 41 60 73 71 47 35 34 41 33 73 67
[307] 66 44 36 48 46 45 49 31 45

retinol$agecat1<-cut(retinol$age, c(0,20,30,40,50,60,90), right=T, labels=c("0 to 20", "21 to 30", "31 to 40", "41 to 50", "51 to 60", "61 to 90"))

[1] (60,90] (60,90] (30,40] (30,40] (60,90] (30,40] (60,90] (50,60] (30,40] (50,60] (60,90] (30,40]
 [13] (50,60] (60,90] (60,90] (60,90] (60,90] (60,90] (60,90] (50,60] (50,60] (20,30] (30,40] (50,60]
 [25] (50,60] (40,50] (60,90] (60,90] (60,90] (60,90] (40,50] (30,40] (60,90] (40,50] (50,60] (40,50]
 [37] (30,40] (30,40] (30,40] (30,40] (50,60] (60,90] (50,60] (30,40] (40,50] (60,90] (60,90] (50,60]
 [49] (60,90] (40,50] (40,50] (60,90] (50,60] (50,60] (40,50] (20,30] (40,50] (40,50] (30,40] (30,40]
 [61] (50,60] (60,90] (40,50] (60,90] (60,90] (60,90] (60,90] (30,40] (60,90] (60,90] (60,90] (40,50]
 [73] (60,90] (50,60] (30,40] (40,50] (40,50] (60,90] (30,40] (40,50] (60,90] (30,40] (40,50] (0,20] 
 [85] (30,40] (40,50] (30,40] (50,60] (60,90] (60,90] (60,90] (50,60] (60,90] (60,90] (40,50] (40,50]
 [97] (20,30] (40,50] (50,60] (40,50] (20,30] (60,90] (60,90] (50,60] (60,90] (60,90] (30,40] (40,50]
[109] (50,60] (40,50] (60,90] (50,60] (30,40] (40,50] (30,40] (60,90] (50,60] (40,50] (30,40] (60,90]
[121] (50,60] (40,50] (30,40] (60,90] (30,40] (40,50] (60,90] (20,30] (40,50] (60,90] (40,50] (40,50]
[133] (60,90] (30,40] (60,90] (60,90] (30,40] (60,90] (60,90] (40,50] (40,50] (50,60] (30,40] (30,40]
[145] (40,50] (40,50] (60,90] (50,60] (60,90] (40,50] (20,30] (50,60] (60,90] (30,40] (40,50] (60,90]
[157] (60,90] (50,60] (40,50] (40,50] (30,40] (30,40] (40,50] (20,30] (40,50] (60,90] (40,50] (40,50]
[169] (30,40] (30,40] (40,50] (40,50] (50,60] (30,40] (50,60] (50,60] (60,90] (40,50] (40,50] (20,30]
[181] (20,30] (60,90] (40,50] (30,40] (50,60] (40,50] (40,50] (40,50] (30,40] (30,40] (50,60] (30,40]
[193] (40,50] (60,90] (50,60] (40,50] (30,40] (40,50] (30,40] (60,90] (40,50] (40,50] (40,50] (30,40]
[205] (30,40] (30,40] (30,40] (40,50] (50,60] (40,50] (30,40] (30,40] (50,60] (40,50] (30,40] (50,60]
[217] (40,50] (40,50] (40,50] (40,50] (40,50] (20,30] (60,90] (40,50] (40,50] (20,30] (40,50] (30,40]
[229] (20,30] (50,60] (40,50] (30,40] (30,40] (50,60] (60,90] (40,50] (50,60] (40,50] (30,40] (30,40]
[241] (40,50] (60,90] (60,90] (50,60] (60,90] (60,90] (30,40] (20,30] (30,40] (40,50] (60,90] (30,40]
[253] (40,50] (60,90] (30,40] (30,40] (30,40] (20,30] (60,90] (40,50] (60,90] (40,50] (60,90] (40,50]
[265] (40,50] (20,30] (30,40] (60,90] (40,50] (60,90] (50,60] (40,50] (50,60] (60,90] (30,40] (30,40]
[277] (40,50] (60,90] (30,40] (30,40] (40,50] (30,40] (40,50] (60,90] (20,30] (60,90] (60,90] (50,60]
[289] (20,30] (30,40] (40,50] (50,60] (50,60] (30,40] (60,90] (40,50] (50,60] (60,90] (60,90] (40,50]
[301] (30,40] (30,40] (40,50] (30,40] (60,90] (60,90] (60,90] (40,50] (30,40] (40,50] (40,50] (40,50]

[313] (40,50] (30,40] (40,50]


describe (retinol$agecat1)

Factor 
         
x         (40,50] (60,90] (30,40] (50,60] (20,30] (0,20]
  Count     92.00    86.0   73.00    46.0    17.0   1.00
  Percent   29.21    27.3   23.17    14.6     5.4   0.32
Mode (40,50] 
Mode (40,50]

Density plots and histograms

Using retinol data
{ggplot2}

qplot (cholesterol, data=retinol, geom="histogram", binwidth=20)



qplot (cholesterol, data=retinol, geom="density")

Splitting density plots and histograms by different categories of a selected variable 
qplot (cholesterol, data=retinol, geom="density", colour=tabac)

qplot (cholesterol, data=retinol, geom="histogram", binwidth=20, fill=tabac)

Regression model with variables with different lengths

Sometimes when you are running a regression model with variables that have different lengths, r will prompt with the following error message:


Error in model.frame.default(formula = bmi ~ age, drop.unused.levels = TRUE) : 
  variable lengths differ (found for 'age')

This error message is mainly due to the fact that either bmi or age has different number of NA observations. The first step is to remove the NAs from the data frame at hand

Remove NAs

df2=df1[complete.cases(df1),]



Box plots using {ggplot2} {ggpubr} and {car}



Download retinol data

> retinol=read.csv2("C:/Users/ke05/Documents/My Dropbox/Biostat course/EPHD 312/Spring 2017/Project 1/TPretinol.csv")
> retinol$tabac<-factor(tabac, levels=c(1,2,3), labels=c("Never", "Ex", "Current"))
# Plot weight by group and color by group
library("ggpubr")
> ggboxplot(retinol, x = "tabac", y = "age", color = "tabac", palette = c("#00AFBB", "#E7B800", "#FC4E07"), order = c("Never", "Ex", "Current"), ylab = "age", xlab = "Smoking Status")



#using the library {car}

> boxplot (retdiet~tabac, xlab="smoking status", ylab="concentration", data=retinol)

#How to add a row to an existing dataframe



sbp=c(144, 138, 162, 170, 158)
age=c(39, 45, 65, 67, 67)

#start by converting the listings into a data frame
df=data.frame(sbp,age)
df
 sbp age

1 144  39
2 138  45
3 162  65
4 170  67
5 158  67


df1=rbind(df, c(200,800))
df1

sbp age

1 144  39
2 138  45
3 162  65
4 170  67
5 158  67
6 200 800




Working with factors and numbers


#working with factors
>data=c(1,2,3,2,2,3,1,1,3,4,4,3)
>fdata=factor(data)
>fdata
>table (fdata)

#assigning label to a factor variable
>rdata=factor(data,labels=c('I','II','III','IV'))
>rdata
>table (rdata)
#Example using retinol data>retinol$tabac=factor(retinol$tabac, labels=c("Never", "Ex", "Current"))


#transforming an object into factor is necessary so that R treats it as a categorical variable

varname=as.factor(varname)


#transforming string to numeric

names=c("donald", "mickey", "pluto")
names_fact=as.factor(names)
names_fact
names_num=as.numeric(names_fact)
names_num



Basic R


#Creating objects in R
a=c(1,2,3,4)
b=c(4,5,6,1)
c=a+b
c
d=seq(1,20,2)
d

x=c(eleven=11, twelve=12, thirteen=13, fourteen=14)
x
#changing the name of elements of the vector
names (x)[1:2]=c("onze", "douze")
x
mode (x)
class (x)
z=c(1,2,3)

#combining x and z into 1 vector
all=c(y,x,z)
all

#using list we can create objects of varying dimensions and types in a single R object
mylist<-list(c(1,2,3), "dog", TRUE, c(9,8,7))
mylist
sapply(mylist,mode)
 mylist[[2]]
 mylist[[1]]

#you can try it on any other object
x[2]
x[1]

#working with matrices
matrix=matrix(c(1,2,3, 11,12,13),nrow=3, ncol=2, byrow=T, dimnames=list(c("r1", "r2", "r3"), c("c1", "c2")))

matrix

names_matrix=list(c("row1", "row2", "row3"), c("col1", "col2"))

matrix1=matrix(c(1,2,3, 11,12,13),nrow=3, ncol=2, byrow=T, dimnames=names_matrix)

sbp=c(144, 138, 162, 170, 158, 162, 140, 128, 135, 116, 136, 120, 160, 144, 125, 220, 145, 142, 124, 154, 150, 110, 130, 114, 124, 142, 120, 158, 130, 175)

age=c(39, 45, 65, 67, 67, 64, 59, 42, 45, 20, 36, 39, 44, 63, 25, 47, 47, 46, 42, 56, 56, 34, 48, 17, 19, 50, 21, 33, 29, 69)

matrix_age=matrix(age, nrow=5, ncol=6, byrow=T)
matrix_sbp=matrix(sbp, nrow=5, ncol=6, byrow=T)
matrix_age
matrix_sbp

#creating an identity matrix of size 5x5
n=c(5)
I=matrix(0,nrow=n, ncol=n)
I[row(I)==col(I)]=1
I

Intro

#reading R data


mydata=read.csv("filepath.csv")
example:
retinol=read.csv2("C:/Users/ke05/Documents/My Dropbox/Biostat course/EPHD 312/Spring 2017/Project 1/TPretinol.csv")

Stata 13 dataset 
library("readstata13")

dataset_name<-read.dta13(“~/filepath/…/filename.dta”, convert.factors = TRUE, generate.factors = FALSE, encoding = "UTF-8", fromEncoding = NULL, convert.underscore = FALSE, missing.type = FALSE, convert.dates = TRUE, replace.strl = TRUE, add.rownames = FALSE, nonint.factors = TRUE)


dataset_name<-read.dta13(“~/filepath/…/filename.dta”)


*note: you can use either read.csv or read.csv2

Introduction to the Analysis of Survival Data in the Presence of Competing Risks

 https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4741409/