Last Updated: November 21, 2025
Basic Data Types
| Type | Example |
|---|---|
| Numeric |
x <- 42.5
|
| Integer |
y <- 42L
|
| Character |
name <- "John"
|
| Logical |
flag <- TRUE
|
| Complex |
z <- 3 + 2i
|
| Check type |
class(x) # Use typeof(x) for the internal storage type
|
| Convert to numeric |
as.numeric("42")
|
| Convert to character |
as.character(42)
|
Data Structures
| Structure | Example |
|---|---|
| Vector |
v <- c(1, 2, 3, 4, 5)
|
| List |
lst <- list(name="John", age=30, scores=c(85,90,92))
|
| Matrix |
m <- matrix(1:9, nrow=3, ncol=3)
|
| Data frame |
df <- data.frame(name=c("A","B"), value=c(1,2))
|
| Factor |
f <- factor(c("low","med","high"), levels=c("low","med","high"))
|
| Access vector |
v[1] # First element (1-indexed)
|
| Access list |
lst$name # or lst[[1]]
|
| Access data frame |
df$name # Column by name; df[,1] is the first column, df[1,] is the first row
|
dplyr Verbs (Data Manipulation)
| Function | Description |
|---|---|
| select() |
df %>% select(name, age) # Choose columns
|
| filter() |
df %>% filter(age > 25) # Filter rows
|
| arrange() |
df %>% arrange(age) # Sort ascending
|
| arrange(desc()) |
df %>% arrange(desc(age)) # Sort descending
|
| mutate() |
df %>% mutate(age_10 = age + 10) # Add column
|
| summarise() |
df %>% summarise(mean_age = mean(age))
|
| group_by() |
df %>% group_by(category) %>% summarise(avg = mean(value))
|
| count() |
df %>% count(category) # Count occurrences
|
| distinct() |
df %>% distinct(name) # Unique values
|
| rename() |
df %>% rename(new_name = old_name)
|
| slice() |
df %>% slice(1:5) # First 5 rows
|
| pull() |
df %>% pull(age) # Extract column as vector
|
ggplot2 Basics
| Layer | Example |
|---|---|
| Initialize plot |
ggplot(data = df, aes(x = var1, y = var2))
|
| Scatter plot |
+ geom_point()
|
| Line plot |
+ geom_line()
|
| Bar plot |
+ geom_bar(stat="identity")
|
| Histogram |
+ geom_histogram(binwidth=5)
|
| Box plot |
+ geom_boxplot()
|
| Add title |
+ labs(title="My Plot", x="X Label", y="Y Label")
|
| Change theme |
+ theme_minimal() # theme_bw(), theme_classic()
|
| Color by variable |
aes(color = category)
|
| Facet wrap |
+ facet_wrap(~category)
|
| Save plot |
ggsave("plot.png", width=8, height=6)
|
Statistical Functions
| Function | Description |
|---|---|
| mean() |
mean(c(1,2,3,4,5)) # Average
|
| median() |
median(c(1,2,3,4,5))
|
| sd() |
sd(values) # Standard deviation
|
| var() |
var(values) # Variance
|
| min() / max() |
min(values) # Minimum value; max(values) for the maximum
|
| sum() |
sum(c(1,2,3,4,5)) # Sum all values
|
| range() |
range(values) # Min and max
|
| quantile() |
quantile(values, c(0.25, 0.75)) # Quartiles
|
| IQR() |
IQR(values) # Interquartile range
|
| cor() |
cor(x, y) # Correlation coefficient
|
| cov() |
cov(x, y) # Covariance
|
| scale() |
scale(values) # Standardize (z-scores)
|
Statistical Tests
| Test | Code |
|---|---|
| T-test |
t.test(x, y) # Two-sample t-test
|
| One-sample t-test |
t.test(x, mu=0) # Test against a hypothesized mean
|
| Paired t-test |
t.test(x, y, paired=TRUE)
|
| Chi-square test |
chisq.test(table(x, y))
|
| ANOVA |
aov(value ~ group, data=df)
|
| Linear regression |
lm(y ~ x, data=df)
|
| Shapiro-Wilk test |
shapiro.test(values) # Test normality
|
| Wilcoxon test |
wilcox.test(x, y) # Non-parametric alternative
|
Data Import/Export
| Operation | Code |
|---|---|
| Read CSV |
df <- read.csv("data.csv")
|
| Read CSV (readr) |
df <- read_csv("data.csv") # Faster; returns a tibble (requires readr)
|
| Write CSV |
write.csv(df, "output.csv", row.names=FALSE)
|
| Read Excel |
library(readxl); df <- read_excel("data.xlsx")
|
| Write Excel |
library(writexl); write_xlsx(df, "output.xlsx")
|
| Read RDS |
df <- readRDS("data.rds") # R native format
|
| Write RDS |
saveRDS(df, "data.rds")
|
String Manipulation (stringr)
| Function | Example |
|---|---|
| str_length() |
str_length("hello") # Returns 5
|
| str_to_upper() |
str_to_upper("hello") # "HELLO"
|
| str_to_lower() |
str_to_lower("HELLO") # "hello"
|
| str_trim() |
str_trim(" hello ") # "hello"
|
| str_replace() |
str_replace("hello", "l", "r") # "herlo"
|
| str_replace_all() |
str_replace_all("hello", "l", "r") # "herro"
|
| str_detect() |
str_detect("hello", "ell") # TRUE
|
| str_subset() |
str_subset(c("apple","banana"), "a") # Returns both (each contains "a")
|
| str_split() |
str_split("a,b,c", ",") # List of vectors
|
Control Flow
| Statement | Example |
|---|---|
| If statement |
if(x > 5) { print("Big") }
|
| If-else |
if(x > 5) { print("Big") } else { print("Small") }
|
| If-else if |
if(x > 10) { } else if(x > 5) { } else { }
|
| For loop |
for(i in 1:10) { print(i) }
|
| While loop |
while(x < 10) { x <- x + 1 }
|
| Vectorized if |
ifelse(x > 5, "Big", "Small")
|
Apply Functions (Avoiding Loops)
| Function | Use Case |
|---|---|
| apply() |
apply(matrix, 1, sum) # Apply to rows (1) or cols (2)
|
| lapply() |
lapply(list, function(x) x*2) # Returns list
|
| sapply() |
sapply(list, function(x) x*2) # Simplifies to a vector or matrix when possible
|
| mapply() |
mapply(sum, list1, list2) # Multivariate apply
|
| tapply() |
tapply(values, groups, mean) # Apply by group
|
Tidyr Functions (Reshaping Data)
| Function | Description |
|---|---|
| pivot_longer() |
df %>% pivot_longer(cols=c(col1,col2), names_to="var", values_to="val")
|
| pivot_wider() |
df %>% pivot_wider(names_from=var, values_from=val)
|
| separate() |
df %>% separate(col, into=c("part1","part2"), sep="-")
|
| unite() |
df %>% unite("new_col", col1, col2, sep="-")
|
| drop_na() |
df %>% drop_na() # Remove rows with NA
|
| replace_na() |
df %>% replace_na(list(col1=0))
|
💡 Pro Tips:
- Use <- for assignment (not =) to follow R conventions
- Install tidyverse with install.packages("tidyverse") for dplyr, ggplot2, and more
- Use %>% pipe operator to chain operations for readable code
- View(df) opens the data frame in a spreadsheet-like viewer
- Use head(df) and tail(df) to preview data
- str(df) shows structure of data frame
- summary(df) provides statistical summary
- Use RStudio for the best R programming experience