In [47]:
# Import the standard-library os module (working-directory and listing calls)
import os

# Print the kernel's current working directory
print(os.getcwd())
/Users/loanrobinson/Documents/Desktop/dataset
In [48]:
# Change directory.
# The dataset location can be overridden through the DATASET_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used as the default.
DATA_DIR = os.environ.get(
    "DATASET_DIR", "/Users/loanrobinson/Documents/Desktop/dataset"
)
os.chdir(DATA_DIR)
In [49]:
# Capture the working directory after the change and report it
path = os.getcwd()
print("My current working dirrectory is %s" % (path,))
My current working dirrectory is /Users/loanrobinson/Documents/Desktop/dataset
In [50]:
# Show every entry in the current working directory
print(os.listdir("."))
['trans.R', 'Descriptive table.docx', 'Lab6ex2.R', '.DS_Store', 'lab6.pdf', 'CH01TA01.txt', 'CH03TA01.txt', 'data.R', 'DAdataFinalDA16.csv', 'CH03TA10.txt', 'R7.pdf', 'Lab6ex1.R', 'CH03TA07.txt', 'DataExam2016FinalComplete.pdf', 'latexFineControl.pdf', 'dataSet.R', 'CH03TA08.txt', 'subSetData.xlsx', 'Lab6ex2.csv', '.Rapp.history', 'Lab6ex1.csv', 'subSetData.csv', 'lect_12.pdf']
In [51]:
# List (not load) the CSV files in the working directory, one name per line
import glob
for csv_name in glob.iglob("*.csv"):
    print(csv_name)
DAdataFinalDA16.csv
Lab6ex2.csv
Lab6ex1.csv
subSetData.csv
In [52]:
# Print the matching CSV file names as a single list
print(glob.glob("*.csv"))
['DAdataFinalDA16.csv', 'Lab6ex2.csv', 'Lab6ex1.csv', 'subSetData.csv']
In [53]:
# Collect the CSV files whose names start with "Lab", in sorted order
files = glob.glob("Lab*.csv")
files.sort()
print(files)
['Lab6ex1.csv', 'Lab6ex2.csv']
In [54]:
# pandas is used by the following cells to load the CSV files
import pandas as pd

# Derive each file's base name by stripping only the final extension.
# os.path.splitext is used instead of split(".")[0] so that a name containing
# extra dots (e.g. "a.b.csv") keeps everything before the last ".csv".
list_names = [os.path.splitext(file_name)[0] for file_name in files]
print(list_names)
['Lab6ex1', 'Lab6ex2']
In [55]:
# Join the base names into one comma-separated string; printing the string
# shows the names without the list's brackets and quotes
names = ",".join(list_names)
print(names)
Lab6ex1,Lab6ex2
In [56]:
# Load every selected CSV file into its own DataFrame, keeping file order
dfList = list(map(pd.read_csv, files))

# Preview the first four rows of the first two frames
for frame_index in (0, 1):
    print(dfList[frame_index].head(4))
  trtmt block  weight
0    MC     I    0.18
1    MC    II    0.30
2    MC   III    0.28
3    MC    IV    0.44
  trtmt  response
0     A       220
1     B        96
2     C        62
3     D       378
In [57]:
# Summary statistics followed by the column labels of the first frame
print(dfList[0].describe(), dfList[0].columns, sep="\n")
# Last expression: the cell displays the object's type
type(dfList[0])
           weight
count   24.000000
mean    49.000000
std     69.647374
min      0.180000
25%      0.435000
50%      2.650000
75%    129.000000
max    176.000000
Index(['trtmt', 'block', 'weight'], dtype='object')
Out[57]:
pandas.core.frame.DataFrame
In [58]:
# Bind the two loaded frames to variables named after their source files
# (unpacking requires dfList to hold exactly two frames)
Lab6ex1, Lab6ex2 = map(pd.DataFrame, dfList)
print(Lab6ex2)
   trtmt  response
0      A       220
1      B        96
2      C        62
3      D       378
4      E       197
5      F        77
6      A       200
7      B       213
8      C        75
9      D       323
10     E       100
11     F        80
12     A       311
13     B       142
14     C        94
15     D       228
16     E       139
17     F       123
18     A       196
19     B       154
20     C        92
21     D       177
22     E       198
23     F       118
24     A       262
25     B       151
26     C        88
27     D       265
28     E       131
29     F       101
In [59]:
# Positional selection: the first 5 rows and the first 2 columns
Lab6ex2.iloc[:5, :2]
Out[59]:
trtmt response
0 A 220
1 B 96
2 C 62
3 D 378
4 E 197