R 기초 정리

라이브러리 사용

library(readxl) -> 엑셀 데이터를 처리하기 위한 패키지
library(XML) -> XML 데이터를 사용하기 위한 패키지
library(jsonlite) -> json 데이터를 사용하기 위한 패키지
library(psych) -> 왜도와 첨도를 구하는 패키지
library(descr) -> 빈도분석을 하는 패키지
library(MASS) -> MASS 내장 데이터를 사용하기 위한 패키지
library(rvest) -> 웹 페이지를 스크래핑하기 위한 패키지
library(stringr) -> 문자열을 가공하기 위한 패키지
library(dplyr) -> 데이터 세트의 rename을 사용하기 위한 패키지

Work Directory 설정

setwd("C:/Users/kkeun/R_")     # Set the working directory; relative file paths used afterwards resolve from here

엑셀 데이터 처리

library(readxl)     # Load readxl, which provides read_excel()
exdata1 <- read_excel("Sample.xlsx")    # Read the Excel file (path is relative to the working directory)

View(exdata1)     # Open the data set in the viewer
str(exdata1)        # Show the type/structure of each variable
dim(exdata1)      # Number of rows and columns
ls(exdata1)         # List the column names

library(dplyr) # Load dplyr, which provides rename()
exdata1 <- rename(exdata1, Y17_AMT=AMT17, Y16_AMT=AMT16)   # Rename columns; note the order is new_name = old_name

exdata1$AMT <- exdata1$Y17_AMT + exdata1$Y16_AMT # Derived variable: sum of the two AMT columns
exdata1$CNT <- exdata1$Y17_CNT + exdata1$Y16_CNT # Derived variable: sum of the two CNT columns
exdata1$AVG_AMT <- exdata1$AMT / exdata1$CNT # Derived variable: amount per count
exdata1$AGE50_YN <- ifelse(exdata1$AGE>=50, "Y", "N")    # ifelse(condition, value_if_true, value_if_false), applied element-wise

데이터 프레임(csv) 처리

setwd("C:/Users/kkeun/R_")     # Set the working directory
options("width"=500)   # Set the console output width to 500
data <- read.csv("data.csv", header=T, fileEncoding = "EUC-KR")   # Read the CSV file (first row is the header, file is EUC-KR encoded)

apply_data <- subset(data, select=c(쇼핑1월, 쇼핑2월, 쇼핑3월))   # Keep only the three monthly shopping columns in apply_data
View(apply_data)

apply(apply_data, 1, max) # Maximum of each row

# apply() applies a function across the rows or columns of its input in one call
# apply(data, 1, function) -> apply function to each row of data
# apply(data, 2, function) -> apply function to each column of data

apply(apply_data, 2, sum) # Sum of each column of apply_data

# lapply() returns its results as a list; it accepts vectors, lists and data frames
# sapply() simplifies its results to a vector where possible

x <- matrix(1:4, 2, 2) # A 2 x 2 numeric matrix (not a data frame)
apply(x, 1, sum)
apply(x, 2, min)
apply(x, 1, max)

# Column-wise summaries of the four numeric iris columns
apply(iris[, 1:4],2,sum)
apply(iris[, 1:4],2,mean)
apply(iris[, 1:4],2,min)
apply(iris[, 1:4],2,max)
apply(iris[, 1:4],2,median)

lapply(iris[, 1:4], sum) # Per-column sums, returned as a list
sapply(iris[, 1:4], mean) # Per-column means, returned as a named vector

write.csv(data, "save_data.csv", row.names=TRUE) # Save as CSV; row.names=TRUE also writes the row names as the first column

data <- read.csv("data.csv", header=T, fileEncoding = "EUC-KR") # Read the CSV file again
View(data) # Inspect the data in the viewer

# Add and modify columns on `data`.
data$변수 <- "new" # New column; the single value is recycled to every row
data$쇼핑합계 <- data$쇼핑1월 + data$쇼핑2월 + data$쇼핑3월 # Per-row total of the three months
# Per-row monthly average. The previous mean(a + b + c) collapsed all rows into
# one scalar (the mean of the row totals) and wrote that same number to every row.
data$쇼핑평균 <- data$쇼핑합계 / 3

data$변수 <- "change" # Overwrite the values of an existing column

# Encode 성별 numerically: convert to a factor, then take the integer level codes.
data$성별num <- as.numeric(as.factor(data$성별))

data_man <- subset(data, data$성별 == '남자') # Split the data: rows where 성별 is '남자'
data_woman <- subset(data, data$성별 == '여자') # Split the data: rows where 성별 is '여자'

sub_data <- subset(data, select=c(성별, 연령대, 직업, 쇼핑액)) # Split the data: keep specific columns only
View(sub_data)

sub_data1 <- subset(data, select=c(성별, 연령대))
sub_data2 <- subset(data, select=c(직업, 쇼핑액))
cmerge_data <- cbind(sub_data1, sub_data2) # Column-wise (side-by-side) merge; both inputs must have the same number of rows
View(cmerge_data)

rmerge_data <- rbind(data_man, data_woman) # Row-wise (stacked) merge; rows of the second input are appended below the first
View(rmerge_data)

order_data <- data[order(data$쇼핑액),] # Sort the rows by one column (ascending)
order_data <- data[order(data$성별, data$쇼핑액),] # Sort by 성별, then by 쇼핑액
order_data <- data[order(data$성별, -data$쇼핑액),] # Sort by 성별, then by 쇼핑액 descending (negated key)
View(order_data)

sub_data3 <- subset(data, select=c(쇼핑1월, 쇼핑2월, 쇼핑3월))
apply(sub_data3, 1, max) # Apply max per row / MARGIN 1 = rows
apply(sub_data3, 2, sum) # Apply sum per column / MARGIN 2 = columns

R 내장 데이터 처리

AirPassengers # A data set that ships with R

library("MASS")
data(Boston) # A data set that ships with the MASS package

data <- read.csv("6_25.csv", header=T, fileEncoding = "EUC-KR") # Read a CSV file
View(data)

data("iris") # Load the built-in "iris" data set

str(iris) # Compact overview of the structure: column types and first values

ncol(iris) # Number of columns

nrow(iris) # Number of rows (observations)

dim(iris) # Number of rows and columns together

length(iris) # For a data frame, length() is the number of columns

data <- c(1,2,3,4,5)
length(data) # For a vector, length() is the number of elements
length(iris$Species) # Number of elements in one column of a data frame

ls(iris) # List the column names

tail(iris) # Last rows of the data set; the default is n = 6 (same default as head())
tail(iris, n=3) # Last 3 rows

mean(iris$Sepal.Length) # Mean
median(iris$Sepal.Length) # Median

min(iris$Sepal.Length) # Minimum
max(iris$Sepal.Length) # Maximum
range(iris$Sepal.Length) # Minimum and maximum as a pair

quantile(iris$Sepal.Length) # Quantiles (0%, 25%, 50%, 75%, 100% by default)
quantile(iris$Sepal.Length, probs=0.25) # First quartile
quantile(iris$Sepal.Length, probs=0.50) # Second quartile (median)
quantile(iris$Sepal.Length, probs=0.75) # Third quartile
quantile(iris$Sepal.Length, probs=0.80) # 80th percentile

var(iris$Sepal.Length) # Variance: spread around the mean
sd(iris$Sepal.Length) # Standard deviation: spread of the values

library("psych") # Load psych
kurtosi(iris$Sepal.Length) # Kurtosis: a measure of how heavy the tails of the distribution are
skew(iris$Sepal.Length) # Skewness: a measure of the asymmetry of the distribution

library("descr") # Load descr
freq_test <- freq(iris$Sepal.Length, plot=F) # Frequency analysis; plot=F skips the chart
freq_test # Print the frequency table

names(iris) # Column names

class(iris) # Class of the data set object

그래프 기초

# Bar charts from a numeric vector and from a matrix.
data <- c(5, 7, 3, 4, 5, 9, 10)

barplot(data)                # vertical bars
barplot(data, horiz = TRUE)  # horizontal bars

# 4 x 2 matrix: each column becomes one bar group, each row one series.
data <- matrix(c(5, 9, 10, 3, 5, 7, 3, 4), 4, 2)

data

barplot(data)  # stacked bars (the default for a matrix)

# beside = TRUE draws the four rows side by side instead of stacked.
# The legend needs one distinct label per row; the original listed "2학년" twice.
# `legend.text` is spelled out rather than relying on partial matching of `legend`.
barplot(data, beside = TRUE, main = "학생수",
        legend.text = c("1학년", "2학년", "3학년", "4학년"))

x <- rnorm(300, mean=10, sd=2) # Draw 300 random samples from a normal distribution with mean 10 and sd 2

hist(x) # Histogram of counts
hist(x, freq=F) # freq=F -> show density instead of counts
lines(density(x)) # Overlay the estimated density curve on the density histogram

x <- c(182,190,213,205,231,250,242)

#plot(x, type="l")

y <- c(190,180,200,210,220,234,235)

#plot(x, type="o", col="red", xlab="년도", ylab="억원", main="매출현황")
#lines(y, type="o", col="blue")

z <- c(195,185,190,215,220,230,225)
data <- cbind(x,y,z) # Combine the three vectors as columns of one matrix
# rbind(A,B) -> combine by rows
# cbind(A,B) -> combine by columns
# merge(A,B,by='key') -> join on matching values of the key column

matplot(data, type="b", col=2:4, pch=1) # One line per column of data
lnd <- c("2018년 매출", "2019년 매출", "2020년 매출")
legend("topleft", legend=lnd, col=2:4, pch=1, cex=0.3) # Legend colours/symbols match the matplot call above

data <- c(5, 7, 3, 4, 5, 9, 10)

barplot(data)
par(new=T) # Draw the next plot on top of the current one instead of starting a new page
plot(data, type="o")

par("mar") # Show the current plot margins
par(mar=c(1,1,1,1)) # Set all four margins to 1 line; this stays in effect for later plots

# Draw the same (x, y) data once per basic plot() `type`, arranged in a 3 x 3 grid.
x <- c(29, 14, 9, 26, 15, 13, 28, 24, 17, 4, 19, 22, 2, 25, 8, 6, 16, 18, 21, 30)
y <- c(9, 3, 26, 27, 10, 21, 8, 4, 28, 24, 5, 6, 22, 29, 20, 25, 12, 1, 2, 15)

# par() returns the previous value of the settings it changes. Keeping it lets
# the layout be restored below; otherwise every later plot in the session would
# still be drawn into one cell of the 3 x 3 grid.
old_par <- par(mfrow = c(3, 3))

# One panel per type, in the same order as the original nine calls.
for (plot_type in c("p", "l", "b", "c", "o", "h", "s", "S", "n")) {
  plot(x, y,
       main = paste0("Plot ", plot_type, "-type"),
       xlab = "x-label", ylab = "y-label", type = plot_type)
}

par(old_par)  # back to the layout that was active before this block

# Pie chart of five values, labelled with a name and its percentage share.
data <- c(280, 170, 120, 100, 85)

pie(data)  # unlabelled version first

lbl <- c("서울", "부산", "경북", "전남", "충청")
pct <- round(data / sum(data) * 100)  # share of the total, rounded to a whole percent
pct

lbl <- paste(lbl, pct)  # "<name> <percent>", separated by a space
lbl

lbl <- paste0(lbl, "%")  # append the percent sign with no separator
lbl

# `labels` is spelled out in full; the original `label =` only worked through
# partial argument matching.
pie(data, labels = lbl, radius = 1.2)

# Scatter plot demonstrating the title, subtitle, axis-label, colour and symbol arguments.
x <- c(29, 14, 9, 26, 15, 13, 28, 24, 17, 4, 19, 22, 2, 25, 8, 6, 16, 18, 21, 30)
y <- c(9, 3, 26, 27, 10, 21, 8, 4, 28, 24, 5, 6, 22, 29, 20, 25, 12, 1, 2, 15)

# Each vector is sorted on its own, so the plotted pairs are (i-th smallest x,
# i-th smallest y), not the original pairs.
x <- sort(x)
y <- sort(y)

# ylab previously read "y축의 제목목" (duplicated last character); fixed here.
plot(x, y, main = "plot의 제목", sub = "plot의 부제목",
     xlab = "x축의 제목", ylab = "y축의 제목", col = "6", pch = 11)

data <- c(1,2,3,4,5,6,7)
setwd("E:/Directory/Directory_2/R_") # Set the working directory; the PDF below is written here
pdf("bargraph.pdf") # Open a PDF graphics device; subsequent plots go to this file
barplot(data)
dev.off() # Close the device, which finishes writing the file

xml 데이터 처리

library(XML) # Load the XML package
setwd("C:/Users/rmsgu/R_") # Set the working directory

xml_data <- xmlToDataFrame("data_ex.xml") # Read the XML file into a data frame
View(xml_data) # Inspect the XML data in the viewer

json 데이터 처리

library(jsonlite) # Load the jsonlite package
json_data <- fromJSON("data_ex.json") # Read the JSON file into an R object
str(json_data) # Show the structure of the parsed JSON data

온도 나누기 함수 기초

# Split the observations in `tmp` into those below 20 and those at or above 20.
tmp <- c(18.5, 20.0, 20.1, 20.3, 22.1, 19.8, 19.2, 18.2,
         17.6, 14.8, 18.0, 17.3, 16.3, 14.6, 11.9)

N <- length(tmp)
# The comparison is vectorised and sum() counts the TRUEs, so no index loop is
# needed (a 1:N loop would also misbehave when N is 0).
low_days <- sum(tmp < 20)
high_days <- N - low_days  # everything that is not below 20
low_days
high_days

# Count how many values of `x` are below a threshold and how many are at or above it.
#
# Args:
#   x: numeric vector of observations. NA entries are not counted in either group.
#   y: single numeric threshold.
#
# Returns:
#   A character vector of length 4:
#   c("low_days : ", <count below y>, "high_days : ", <count at or above y>).
#   The counts become strings because c() coerces everything to character.
Get_tmp_days <- function(x, y){
  # Vectorised counts replace the former `for (i in 1:N)` loop. With an empty
  # `x` that loop iterated over c(1, 0) and stopped with
  # "missing value where TRUE/FALSE needed"; here empty input yields 0 and 0.
  low_days <- sum(x < y, na.rm = TRUE)
  high_days <- sum(x >= y, na.rm = TRUE)
  c("low_days : ", low_days, "high_days : ", high_days)
}

# Sample observations to classify
tmp <- c(18.5, 20.0, 20.1, 20.3, 22.1, 19.8, 19.2, 18.2,
         17.6, 14.8, 18.0, 17.3, 16.3, 14.6, 11.9)

# Count the values below 20 and the values at or above 20
Get_tmp_days(tmp, 20)

url로 웹 스크래핑 후 데이터 처리

library(rvest) # HTML reading and node selection
library(stringr) # String helpers (str_trim, str_replace, str_split)

url <- "https://www.bobaedream.co.kr/cyber/CyberCar.php?sel_m_gubun=ALL" # Address of the page to scrape
usedCar <- read_html(url) # Download and parse the page
usedCar

carinfo <- html_nodes(usedCar, css=".product-item") # Select every node matching the CSS selector
head(carinfo)
carinfo

title_tmp <- html_nodes(carinfo, css=".tit.ellipsis") # Select nodes carrying both classes; in a CSS selector the classes are joined with '.' and no space
title <- html_text(title_tmp) # Extract the text content of the nodes
title <- str_trim(title) # Strip leading and trailing whitespace
title

year_tmp <- html_nodes(carinfo, css=".mode-cell.year") # Nodes with classes mode-cell and year
year <- html_text(year_tmp)
year <- str_trim(year)
year

# Same select / extract text / trim steps for the fuel nodes
fuel_tmp <- html_nodes(carinfo, css=".mode-cell.fuel")
fuel <- html_text(fuel_tmp)
fuel <- str_trim(fuel)

# Same select / extract text / trim steps for the km nodes
km_tmp <- html_nodes(carinfo, css=".mode-cell.km")
km <- html_text(km_tmp)
km <- str_trim(km)

# Extract the price text of every listing.
price_tmp <- html_nodes(carinfo, css = ".mode-cell.price")
price <- html_text(price_tmp)
price <- str_trim(price)  # strips leading/trailing whitespace only
# Remove every embedded newline. str_replace() only replaces the first match per
# string, so any further line breaks inside a price cell would have been kept.
price <- str_replace_all(price, "\n", "")

# Manufacturer = the part of each title before the first space.
# str_split() is vectorised over `title`, so the titles are split in one call and
# vapply() takes the first piece of each. Unlike the former `1:length(title)`
# loop, this returns character(0) when no titles were scraped (the loop ran over
# c(1, 0) and produced a stray NA), and it does not regrow the vector on every
# iteration.
maker <- vapply(
  str_split(title, " "),
  function(parts) parts[1],
  character(1)
)

# Assemble the scraped fields into one data frame, one row per listing.
car <- data.frame(title, year, fuel, km, price, maker)
View(car)

# Turn the km text into a number. Rules are applied in this order: unit words
# are replaced by zeros first, then the bare unit and the "unregistered" marker
# are removed.
km_rules <- c("만km" = "0000", "천km" = "000", "km" = "", "미등록" = "")
for (km_pattern in names(km_rules)) {
  car$km <- gsub(km_pattern, km_rules[[km_pattern]], car$km)
}
car$km <- as.numeric(car$km)

# Turn the price text into a number by deleting the unit, the status words and
# the thousands separator.
for (price_token in c("만원", "계약", "팔림", "금융리스", ",")) {
  car$price <- gsub(price_token, "", car$price)
}
car$price <- as.numeric(car$price)
View(car)

'시험공부' 카테고리의 다른 글

빅 데이터 관련 이론 정리 (0)	2022.04.18

R 기초 정리

'시험공부' 카테고리의 다른 글

'시험공부' Related Articles

티스토리툴바