📜  R 编程中的 dplyr 包

📅  最后修改于: 2022-05-13 01:55:35.256000             🧑  作者: Mango

R 编程中的 dplyr 包

R 编程语言的 dplyr 包是一套数据操作语法,它提供了一组统一的“动词”(函数),有助于解决最常见的数据操作难题。

R 中的 dplyr 包以更快、更简单的方式执行以下步骤:

  • 通过限制可选项,让使用者可以把注意力更多地集中在数据操作问题本身。
  • 提供简单的“动词”,即对应每种常见数据操作的函数,从而可以更快地把想法转换为代码。
  • 使用高效的后端,从而减少等待计算完成的时间。

重要的动词函数

dplyr 包提供了多个可用于数据操作的重要函数,它们是:

  • filter() 函数:根据取值筛选观测(行)。
R
# Load dplyr: without it, `filter` resolves to stats::filter, and neither
# `%>%` nor `tibble()` is available.
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)
d

# Rows where ht is NA
d %>% filter(is.na(ht))

# Rows where ht is not NA
d %>% filter(!is.na(ht))


R
# Load dplyr for arrange() and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Sort the rows by age, in ascending order
d_by_age <- arrange(d, age)
print(d_by_age)


R
# Load dplyr for select(), its selection helpers, and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# starts_with(): keep only the columns whose name starts with "ht"
select(d, starts_with("ht"))

# -starts_with(): keep every column except those starting with "ht"
select(d, -starts_with("ht"))

# Columns 1 to 2, selected by position
select(d, 1:2)

# Columns whose name contains "a"
select(d, contains("a"))

# Columns whose name matches the regular expression "na"
select(d, matches("na"))


R
# Load dplyr for mutate(), transmute(), and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Add x3 = ht + age and keep all existing columns;
# x3 is NA wherever ht is NA
mutate(d, x3 = ht + age)

# Compute the same x3 but keep only the new column
transmute(d, x3 = ht + age)


R
# Load dplyr for summarise() and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Mean of age
summarise(d, mean = mean(age))

# Minimum of age (the result column is labelled `med`)
summarise(d, med = min(age))

# Maximum of age (the result column is labelled `med`)
summarise(d, med = max(age))

# Median of age
summarise(d, med = median(age))


R
# Load dplyr for sample_n(), sample_frac(), and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Draw three rows at random (the rows chosen differ from run to run)
sample_n(d, 3)

# Draw 50% of the rows at random, i.e. 2 of the 4 rows
sample_frac(d, 0.50)


输出:

# A tibble: 4 x 4
  name      age    ht school
  <chr>   <dbl> <dbl> <chr> 
1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no

# A tibble: 2 x 4
  name      age    ht school
  <chr>   <dbl> <dbl> <chr> 
1 Bhavesh     5    NA yes   
2 Chaman      9    NA no

# A tibble: 2 x 4
  name    age    ht school
  <chr> <dbl> <dbl> <chr> 
1 Abhi      7    46 yes   
2 Dimri    16    69 no
  • arrange():用于对观测(行)重新排序。

R

# Load dplyr for arrange() and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Sort the rows by age, in ascending order
d_by_age <- arrange(d, age)
print(d_by_age)

输出:

# A tibble: 4 x 4
  name      age    ht school
  <chr>   <dbl> <dbl> <chr> 
1 Bhavesh     5    NA yes   
2 Abhi        7    46 yes   
3 Chaman      9    NA no    
4 Dimri      16    69 no   
  • select() 和 rename():根据名称选择变量(列)或为变量重命名。

R

# Load dplyr for select(), its selection helpers, and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# starts_with(): keep only the columns whose name starts with "ht"
select(d, starts_with("ht"))

# -starts_with(): keep every column except those starting with "ht"
select(d, -starts_with("ht"))

# Columns 1 to 2, selected by position
select(d, 1:2)

# Columns whose name contains "a"
select(d, contains("a"))

# Columns whose name matches the regular expression "na"
select(d, matches("na"))

输出:

# A tibble: 4 x 1
     ht
  <dbl>
1    46
2    NA
3    NA
4    69

# A tibble: 4 x 3
  name      age school
  <chr>   <dbl> <chr> 
1 Abhi        7 yes   
2 Bhavesh     5 yes   
3 Chaman      9 no    
4 Dimri      16 no

# A tibble: 4 x 2
  name      age
  <chr>   <dbl>
1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16

# A tibble: 4 x 2
  name      age
  <chr>   <dbl>
1 Abhi        7
2 Bhavesh     5
3 Chaman      9
4 Dimri      16

# A tibble: 4 x 1
  name   
  <chr>  
1 Abhi   
2 Bhavesh
3 Chaman 
4 Dimri
  • mutate() 和 transmute():添加新变量,新变量是已有变量的函数。

R

# Load dplyr for mutate(), transmute(), and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Add x3 = ht + age and keep all existing columns;
# x3 is NA wherever ht is NA
mutate(d, x3 = ht + age)

# Compute the same x3 but keep only the new column
transmute(d, x3 = ht + age)

输出:

# A tibble: 4 x 5
  name      age    ht school    x3
  <chr>   <dbl> <dbl> <chr>  <dbl>
1 Abhi        7    46 yes       53
2 Bhavesh     5    NA yes       NA
3 Chaman      9    NA no        NA
4 Dimri      16    69 no        85

# A tibble: 4 x 1
     x3
  <dbl>
1    53
2    NA
3    NA
4    85
  • summarise():将多个值汇总为单个值。

R

# Load dplyr for summarise() and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Mean of age
summarise(d, mean = mean(age))

# Minimum of age (the result column is labelled `med`)
summarise(d, med = min(age))

# Maximum of age (the result column is labelled `med`)
summarise(d, med = max(age))

# Median of age
summarise(d, med = median(age))

输出:

# A tibble: 1 x 1
      mean
     <dbl>
1     9.25

# A tibble: 1 x 1
    med
  <dbl>
1     5

# A tibble: 1 x 1
    med
  <dbl>
1    16

# A tibble: 1 x 1
    med
  <dbl>
1     8
  • sample_n() 和 sample_frac():用于抽取随机样本。

R

# Load dplyr for sample_n(), sample_frac(), and tibble()
library(dplyr)

# Create a tibble (a data frame) with missing values in `ht`
d <- tibble(
  name = c("Abhi", "Bhavesh", "Chaman", "Dimri"),
  age = c(7, 5, 9, 16),
  ht = c(46, NA, NA, 69),
  school = c("yes", "yes", "no", "no")
)

# Draw three rows at random (the rows chosen differ from run to run)
sample_n(d, 3)

# Draw 50% of the rows at random, i.e. 2 of the 4 rows
sample_frac(d, 0.50)

输出:

# A tibble: 3 x 4
  name      age    ht school
  <chr>   <dbl> <dbl> <chr> 
1 Abhi        7    46 yes   
2 Bhavesh     5    NA yes   
3 Chaman      9    NA no 

# A tibble: 2 x 4
  name      age    ht school
  <chr>   <dbl> <dbl> <chr> 
1 Dimri      16    69 no    
2 Bhavesh     5    NA yes