Skip to content
Snippets Groups Projects
Commit ef073e0f authored by yangyus's avatar yangyus
Browse files

Upload New File

parent f676999c
No related branches found
No related tags found
No related merge requests found
hw3.Rmd 0 → 100644
---
title: "hw3"
author: "Yushan Yang"
date: "2022-11-22"
output:
  pdf_document:
    extra_dependencies: ["float"]
---
```{r setup, include=FALSE}
# Global chunk options: echo the code but do not evaluate it when knitting,
# pin figures in place ("!H" relies on the LaTeX `float` package requested in
# the YAML header) and centre them. The knitr option for figure alignment is
# `fig.align`; a bare `align` is not a figure option and has no effect.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = FALSE,
  fig.pos = "!H",
  out.extra = "",
  fig.align = "center"
)
```
Gitlab link: [STATS506 HW3](https://gitlab.eecs.umich.edu/yangyus/stats-506-hw3)
Q1
(1) My system is Mac.
The global (user-level) git configuration file is located at: /Users/yushanyang/.gitconfig
The project (local-level) git configuration file is located at: /Users/yushanyang/.git/config
![Global level git configuration file](figure/global_git_configure.png)
![The content of .gitignore file](figure/gitignore.png)
(2)
```{R}
#' Read a block of records from the quoted, comma-separated business file.
#'
#' The first line of the file is a header. Data records are numbered from 1
#' (the line right after the header); records `n1` through `n2 - 1` are
#' returned, so a call yields at most `n2 - n1` rows.
#'
#' @param n1 Number of the first data record to return (>= 1).
#' @param n2 Number of the record at which to stop (exclusive, >= `n1`).
#' @param filepath Path of the file to read. Defaults to the 2020 business
#'   file used throughout this report.
#' @return A data frame of character columns `state`, `county_code`,
#'   `employee_size_location`, `sales_volume_location` and `census_tract`.
#'   If the file ends before record `n2 - 1`, only the records that exist
#'   are returned (possibly zero rows).
read_lines <- function(n1, n2,
                       filepath = "Data/2020_Business_Academic_QCQ.txt") {
  stopifnot(
    is.numeric(n1), is.numeric(n2),
    length(n1) == 1, length(n2) == 1,
    n1 >= 1, n2 >= n1
  )

  wanted <- c("State", "County Code", "Employee Size (5) - Location",
              "Sales Volume (9) - Location", "Census Tract")
  out_names <- c("state", "county_code", "employee_size_location",
                 "sales_volume_location", "census_tract")

  conn <- file(filepath, "r")
  # Close the connection even if something below fails.
  on.exit(close(conn), add = TRUE)

  header <- readLines(conn, n = 1)
  if (length(header) == 0) {
    stop("'", filepath, "' is empty", call. = FALSE)
  }
  # Fields are wrapped in double quotes, so splitting on the quote character
  # puts every field at a fixed position in the resulting vector.
  header_fields <- unlist(strsplit(header, "\"", fixed = TRUE))
  # match() keeps the positions in the order of `wanted`, so each output
  # column is tied to its header name rather than to the file's column order.
  index <- match(wanted, header_fields)
  if (anyNA(index)) {
    stop("missing column(s) in header: ",
         paste(wanted[is.na(index)], collapse = ", "), call. = FALSE)
  }

  # Skip the records before n1 in bounded chunks so that memory use does not
  # grow with n1.
  to_skip <- n1 - 1
  while (to_skip > 0) {
    n_skipped <- length(readLines(conn, n = min(to_skip, 10000)))
    if (n_skipped == 0) {
      break # reached end of file
    }
    to_skip <- to_skip - n_skipped
  }

  lines <- character(0)
  if (n2 > n1) {
    lines <- readLines(conn, n = n2 - n1)
  }

  # Pull the five wanted fields out of every record; a record that is too
  # short yields NA for the fields it lacks.
  fields <- strsplit(lines, "\"", fixed = TRUE)
  values <- unlist(lapply(fields, function(f) f[index]), use.names = FALSE)
  if (is.null(values)) {
    values <- character(0)
  }
  records <- matrix(values, nrow = length(lines), ncol = length(index),
                    byrow = TRUE)

  df <- as.data.frame(records, stringsAsFactors = FALSE)
  colnames(df) <- out_names
  df
}
```
(3)
```{R}
# import library
library(dplyr)

# The first 300,000 records are processed in batches of 20,000 so that only
# one batch of raw records is held in memory at a time.
batch_size <- 20000
total_rows <- 300000

# Read records n1 .. n2 - 1 and return per-tract totals for Alabama.
summarise_al_batch <- function(n1, n2) {
  batch <- read_lines(n1, n2)
  # as.numeric() rather than as.integer(): values above .Machine$integer.max
  # (about 2.1e9) would become NA under as.integer() and then be zeroed
  # below, and integer sums can overflow to NA.
  batch$employee_size_location <- as.numeric(batch$employee_size_location)
  batch$sales_volume_location <- as.numeric(batch$sales_volume_location)
  # Missing or unparseable entries count as 0.
  batch[is.na(batch)] <- 0
  batch %>%
    filter(state == "AL") %>%
    group_by(state, county_code, census_tract) %>%
    summarise(
      employee_size_sum = sum(employee_size_location),
      sales_volume_sum = sum(sales_volume_location),
      .groups = "drop"
    )
}

# One summary per batch; batches start at records 1, 20001, ..., 280001.
batch_starts <- seq(1, total_rows, by = batch_size)
batch_summaries <- lapply(
  batch_starts,
  function(n1) summarise_al_batch(n1, n1 + batch_size)
)

# A tract can appear in several batches, so combine the batch summaries and
# aggregate once more.
df1 <- bind_rows(batch_summaries) %>%
  group_by(state, county_code, census_tract) %>%
  summarise(
    employee_size_sum = sum(employee_size_sum),
    sales_volume_sum = sum(sales_volume_sum),
    .groups = "drop"
  )

remove(batch_size, total_rows, batch_starts, batch_summaries,
       summarise_al_batch)
```
(4)
```{R}
# connect to MySQL
library(RMySQL)
library(getPass)
# The password must not be stored in a version-controlled file. It is taken
# from the HW3_MYSQL_PASSWORD environment variable when that is set, and
# otherwise requested interactively with masked input.
# NOTE(review): the password committed in earlier revisions of this file
# remains in the git history and should be changed on the server.
db_password <- Sys.getenv("HW3_MYSQL_PASSWORD")
if (!nzchar(db_password)) {
  db_password <- getPass("MySQL password for user 'root': ")
}
conn <- dbConnect(
  RMySQL::MySQL(),
  dbname = "Hw3db",
  host = "localhost",
  port = 3306,
  user = "root",
  password = db_password
)
remove(db_password)
# write df1 to df1 table
# append = TRUE adds the rows to an existing `df1` table rather than
# replacing it, so running this chunk twice inserts the rows twice.
dbWriteTable(conn = conn, name = "df1", value = df1, append = TRUE,
             row.names = FALSE)
remove(df1)
```
![df1 table creation in MySQL command line](figure/df1.png)
(5)
```{R}
# Fetch the ten rows of table df1 with the largest sales_volume_sum,
# largest first, keeping only the tract code and the total.
df <- dbGetQuery(
  conn,
  "SELECT census_tract, sales_volume_sum FROM df1
ORDER BY sales_volume_sum DESC LIMIT 10"
)
```
![Top 10 sales amount tract](figure/top10_sales_amount_tract.png)
(6) ![Git commit and create a new branch](figure/commit_and_create_branch.png)
(7)
```{R}
# load data
df <- read.csv("Data/AL.csv")
df <- df[, c("FIELD19", "FIELD20", "FIELD22", "FIELD45", "FIELD64", "FIELD65")]
# change column names
colnames(df) <- c("household_wealth", "household_income", "estmtd_home_val",
                  "state", "county_code", "census_tract")
# change column data type
# as.numeric() rather than as.integer(): per-tract sums of integers can
# exceed .Machine$integer.max and overflow to NA.
value_cols <- c("household_wealth", "household_income", "estmtd_home_val")
df[value_cols] <- lapply(df[value_cols], as.numeric)
# remove 0 home valuation
# `x != 0` is NA when x is NA, and indexing rows with NA yields all-NA rows,
# so missing valuations have to be excluded explicitly.
has_home_val <- !is.na(df$estmtd_home_val) & df$estmtd_home_val != 0
df <- df[has_home_val, ]
# summarize dataframe
# Missing wealth/income values are skipped so that a single NA does not turn
# a whole tract's total into NA (the business data treats NA as 0 as well).
df2 <- df %>%
  group_by(state, county_code, census_tract) %>%
  summarise(
    household_wealth_sum = sum(household_wealth, na.rm = TRUE),
    household_income_sum = sum(household_income, na.rm = TRUE),
    estmtd_home_val_sum = sum(estmtd_home_val),
    .groups = "drop"
  )
remove(df, value_cols, has_home_val)
# change column data type and add leading zeros
# Left-pad every element of x with "0" up to `width` characters; elements
# that are already long enough are returned unchanged and NA stays NA.
pad_zeros <- function(x, width) {
  x <- as.character(x)
  padded <- paste0(strrep("0", pmax(0L, width - nchar(x))), x)
  padded[is.na(x)] <- NA_character_
  padded
}
# County codes are 3 characters wide and census tracts 6.
df2$county_code <- pad_zeros(df2$county_code, 3)
df2$census_tract <- pad_zeros(df2$census_tract, 6)
remove(pad_zeros)
```
(8)
```{R}
# Append the household summary to the MySQL table `df2`, without a
# row-name column, then drop the local copy.
dbWriteTable(conn, "df2", df2, row.names = FALSE, append = TRUE)
rm(df2)
```
![df2 table creation in MySQL command line](figure/df2.png)
(9) ![The second git commit](figure/commit2.png)
![The git log](figure/git_log.png)
HEAD is a pointer to the commit that is currently checked out, which is normally the tip of the current branch.
(10)
```{R}
library(tidycensus)
# check available variables
var <- load_variables(2010, dataset = "sf1")
var <- var[grepl("^H", var$name), ]
# Download one 2010 decennial variable for every Alabama (state "01") tract,
# keeping only the tract identifier and the count.
fetch_tract_count <- function(variable) {
  get_decennial(
    geography = "tract",
    variables = variable,
    state = "01",
    year = 2010,
    geometry = FALSE
  )[, c("GEOID", "value")]
}
# take out total household number with race and total
# white household and calculate white household percent
race_total <- fetch_tract_count("H006001")
race_white <- fetch_tract_count("H006002")
# Look the totals up by GEOID instead of assuming that both downloads list
# the tracts in the same order.
total_value <- race_total$value[match(race_white$GEOID, race_total$GEOID)]
race_white$state <- rep("AL", nrow(race_white))
# Characters 3-5 of the GEOID are the county code, 6-11 the census tract.
race_white$county_code <- substr(race_white$GEOID, 3, 5)
race_white$census_tract <- substr(race_white$GEOID, 6, 11)
race_white$percent <- round(race_white$value / total_value * 100, 1)
race_white <- race_white[, c("state", "county_code", "census_tract",
                             "percent")]
# Drop tracts without a defined percentage (e.g. 0 / 0). The mask must be
# negated with `!`: `-is.na(x)` is a vector of 0 / -1 indices, which selects
# no rows at all when nothing is NA and otherwise removes only the first row.
race_white <- race_white[!is.na(race_white$percent), ]
remove(var)
remove(race_total, total_value, fetch_tract_count)
# write race_white to df3 table
dbWriteTable(conn = conn, name = "df3", value = race_white,
             row.names = FALSE, append = TRUE)
remove(race_white)
```
![df3 table creation in MySQL command line](figure/df3.png)
(11)
```{R}
# combine data
df <- dbGetQuery(conn=conn,
statement=
'SELECT df1.state AS state, df1.county_code AS county_code,
df1.census_tract AS census_tract, employee_size_sum, sales_volume_sum,
household_wealth_sum, household_income_sum, estmtd_home_val_sum, percent
FROM (df1 JOIN df2 ON df1.census_tract = df2.census_tract AND df1.county_code =
df2.county_code)
JOIN df3 ON df1.census_tract = df3.census_tract AND df1.county_code = df3.county_code')
```
Key variables that should be controlled for in the model: the time at which the data were collected, and whether the tract is in a university zone.
(12) ![The second git log](figure/git_log2.png)
![Merge branch to main](figure/git_merge.png)
To reset the repository to an earlier commit, type: `git reset --hard <commit-hash>`
(13)
```{R}
# Ordinary least squares: total home value per tract regressed on the
# business totals, the household totals and the white-household percentage.
lr <- lm(
  formula = estmtd_home_val_sum ~ employee_size_sum + sales_volume_sum +
    household_wealth_sum + household_income_sum + percent,
  data = df
)
# Print the coefficient table and fit statistics.
summary(lr)
```
![Fit result](figure/fit.png)
Fit the data with a linear regression model where the response is the home value summation in a tract and the variables are: (1) employee size summation in a tract; (2) sales volume summation in a tract; (3) household wealth summation in a tract; (4) household income summation in a tract; (5) percent of white race household in a tract.
The result shows no statistically significant evidence of racial bias in property valuation in the state of Alabama: the coefficient on the percentage of white households has p-value = 0.31 > 0.05. Because total home value is the response, each coefficient gives the change in total home value for a one-unit change in a predictor, holding the other variables fixed. A one-unit increase in total employee size is associated with an increase of 2.93 units (\$1000) in total home value; a one-unit increase in total sales volume (annual revenues) with an increase of 0.037 units; a one-unit (\$1000) increase in total household wealth with an increase of 0.058 units; and a one-unit (\$1000) increase in total household income with an increase of 1.3 units.
Q2
(1) A cluster is composed of many servers connected to each other; each server is called a node. A core is the part of a processor that performs the computations, and a node has many cores.
The login node is for preparing to run a program. The compute node is for running the program.
(2) ![SLURM script example](figure/SLURM.png)
(3) ![Create a symbolic link from home directory to scratch directory](figure/scratch.png)
The full path is: /scratch/stats506s001f22_class_root/stats506s001f22_class/yangyus
If the link is deleted, the original directory is not affected.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment