- shell script Select all
# upgrade pip
# Alternative that runs the installer as root (system-wide install):
# curl https://bootstrap.pypa.io/get-pip.py | sudo python
# Download the get-pip.py bootstrap script and pipe it to the default python.
curl https://bootstrap.pypa.io/get-pip.py | python
# install packages
# Both versions are pinned (requests 2.18.4, turicreate 5.0b1).
sudo pip install requests==2.18.4 turicreate==5.0b1
(1) Test turicreate example - Image Classifier
- shell script Select all
# Work in a dedicated project folder; quoting keeps a $HOME with spaces intact.
mkdir -p "$HOME/MLClassifier"
cd "$HOME/MLClassifier"
# download dataset and cleanup
# The URL must be quoted: an unquoted '&' makes the shell run curl in the
# background and drops the 'export=download' query parameter.
curl -L -o dataset.zip "https://drive.google.com/uc?id=1ZLigrn7YcETalcj2qK6UqXceDdOV3244&export=download"
unzip dataset.zip
# -f: do not report an error when the macOS metadata files are absent
rm -fr __MACOSX; rm -f dataset/.DS_Store dataset/*/.DS_Store
# create python script (quoted 'EOF' so the shell does not expand anything inside)
cat > classifier.py << 'EOF'
import turicreate as turi
# load images from dataset folder
url = "dataset/"
data = turi.image_analysis.load_images(url)
# define image categories: label each image from its file path
data["foodType"] = data["path"].apply(lambda path: "Rice" if "rice" in path else "Soup")
# create sframe
data.save("rice_or_soup.sframe")
# preview dataset
data.explore()
# load sframe
dataBuffer = turi.SFrame("rice_or_soup.sframe")
# create training data using 90% of dataset
trainingBuffers, testingBuffers = dataBuffer.random_split(0.9)
# create model
model = turi.image_classifier.create(trainingBuffers, target="foodType", model="squeezenet_v1.1", max_iterations=100)
# Alternate model use ResNet-50
# model = turi.image_classifier.create(trainingBuffers, target="foodType", model="resnet-50")
# evaluate model
evaluations = model.evaluate(testingBuffers)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(evaluations["accuracy"])
# save model
model.save("rice_or_soup.model")
model.export_coreml("RiceSoupClassifier.mlmodel")
EOF
#run script
python classifier.py
(2) Test turicreate example - Logistic Regression
- shell script Select all
# Work in a dedicated project folder; quoting keeps a $HOME with spaces intact.
mkdir -p "$HOME/LGClassifier"
cd "$HOME/LGClassifier"
# create python script (quoted 'EOF' so the shell does not expand anything inside)
cat > classifier.py << 'EOF'
import turicreate as turi
data = turi.SFrame('http://static.turi.com/datasets/regression/yelp-data.csv')
# binary target: a review counts as "good" when it has 3 stars or more
data['is_good'] = data['stars'] >= 3
# create sframe
data.save("yelp.sframe")
# preview dataset
#data.show()
# load sframe
dataBuffer = turi.SFrame("yelp.sframe")
# create training data using 80% of dataset
train_data, test_data = dataBuffer.random_split(0.8)
# create model
model = turi.logistic_classifier.create(
    train_data,
    target='is_good',
    features=['user_avg_stars',
              'business_avg_stars',
              'user_review_count',
              'business_review_count',
              'city',
              'categories_dict'],
    max_iterations=200)
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(model)
# save predictions
predictions = model.classify(test_data)
print(predictions)
# evaluate model
evaluations = model.evaluate(test_data)
print("Accuracy : %s" % evaluations["accuracy"])
print("Confusion Matrix : \n%s" % evaluations["confusion_matrix"])
EOF
#run script
python classifier.py
(3) Some data manipulation tips when preparing training data
- shell and python scripts Select all
# Remove the quotes and the thousands separators from quoted numbers in a csv
# file (replace each quoted number with the bare number), typically needed
# after "save as CSV" from an excel file.
# for example, "222,267.87","455,365.44",... convert to 222267.87,455365.44,...
#In shell script
# The two substitutions are separated by ';' so the quotes are stripped on every
# line; chained with 'and', the second substitution only ran on lines where the
# first one had removed at least one comma. $1 is the Perl form of the
# backreference on the replacement side.
perl -p -e 's/,(?=[\d,.]*\d")//g; s/"(\d[\d,.]*)"/$1/g' exceldata.csv > dataset.csv
# use a small helper and zip when converting and computing numeric data from 2 data columns
#In python script
import math

# Value used when the rate is undefined (division by zero, infinite numerator)
# or larger than the cap itself.
RATE_CAP = 999999


def safe_rate(x, y):
    """Return x / y, clamped to RATE_CAP, with 0 for missing or unusable inputs.

    Rules, checked in this order:
      * x or y is None                          -> 0
      * x or y is NaN, y is infinite, or x == 0 -> 0
      * x is infinite or y == 0                 -> RATE_CAP
      * x / y greater than RATE_CAP             -> RATE_CAP
      * otherwise                               -> x / y
    """
    if x is None or y is None:
        return 0
    if math.isnan(x) or math.isnan(y) or math.isinf(y) or x == 0:
        return 0
    if math.isinf(x) or y == 0:
        return RATE_CAP
    ratio = x / y
    return RATE_CAP if ratio > RATE_CAP else ratio


# A list comprehension works on both Python 2 and Python 3; the former
# "lambda (x, y): ..." tuple parameter is a SyntaxError on Python 3, and map()
# returns a lazy iterator there instead of a list.
data['rate'] = [safe_rate(x, y) for x, y in zip(data['OS'], data['Total Amount'])]
# replace training data when values are inf(infinity) or nan(Not A Number) in 'amount' column
#In python script
import math
# two passes: first NaN -> 0, then inf -> 999
train_data['amount'] = train_data['amount'].apply(lambda x: 0 if math.isnan(x) else x)
train_data['amount'] = train_data['amount'].apply(lambda x: 999 if math.isinf(x) else x)
# or use nested if else (same result in a single pass)
#In python script
import math
train_data['amount'] = train_data['amount'].apply(lambda x: 0 if math.isnan(x) else (999 if math.isinf(x) else x))
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(train_data['amount'].summary())
# remove rows in training data with inf(infinity) or nan(Not A Number) values in 'amount' column
#In python script
import math
# the apply() yields a 0/1 mask; indexing with it keeps only the rows marked 1
train_data = train_data[train_data['amount'].apply(lambda x: 0 if math.isinf(x) or math.isnan(x) else 1)]
# SFrame API reference (beware: some of the listed methods do not work)
https://apple.github.io/turicreate/docs/api/generated/turicreate.SFrame.html
# Other SFrame data manipulation examples
https://github.com/apple/turicreate/blob/master/userguide/sframe/data-manipulation.md
(4) Some data examination tips
- python script Select all
# summary
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(train_data['amount'].summary())
# crosstab
import pandas as pd
pd.crosstab(data["Rating"], data["is_bad"], margins=True)
# custom frequency count for 'amount' column
import pandas as pd


def amount_bucket(x):
    """Return the label of the 10-wide bucket that contains x.

    Upper bounds are inclusive and checked in ascending order; anything not
    matched by a bound (values above 50) falls into ">50".
    """
    # The "30-40" bound is 40; it was previously 30, which made that bucket
    # unreachable and counted values in (30, 40] as "40-50".
    buckets = ((10, " 0-10"), (20, "10-20"), (30, "20-30"), (40, "30-40"), (50, "40-50"))
    for upper, label in buckets:
        if x <= upper:
            return label
    return ">50"


pd.crosstab(train_data['amount'].apply(amount_bucket), "Count")
No comments:
Post a Comment