Spaces:
Runtime error
Runtime error
#!/usr/local/bin/python3

# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os | |
import sys | |
import random | |
import statistics | |
import matplotlib.pyplot as plt | |
import argparse | |
from matumizi.util import * | |
from matumizi.mlutil import * | |
from matumizi.daexp import * | |
from matumizi.sampler import * | |
# number of features in the base loan record (feature indices 0-10)
NFEAT = 11

# number of features when the extra ones are added (saving, retirement and
# number of prior mortgage loans, feature indices 11-13)
NFEAT_EXT = 14
class LoanApprove:
    """
    Synthetic loan approval data generator and dummy variable encoder.

    Records are written to stdout as comma separated lines of the form
    id, feature values, class label ("1" or "0").
    """

    def __init__(self, numLoans=None):
        """
        Parameters
            numLoans : number of loan records to generate; only needed by generateTwo
        """
        self.numLoans = numLoans
        self.marStatus = ["married", "single", "divorced"]
        self.loanTerm = ["7", "15", "30"]
        self.addExtra = False

    def initTwo(self):
        """
        initialize samplers

        Builds the class label sampler (self.approvDistr) and the class
        conditional feature samplers (self.featCondDister). The latter is keyed
        by (class label, feature index). Features 11-13 are only added when
        self.addExtra is set.
        """
        # class label distribution
        self.approvDistr = CategoricalRejectSampler(("1", 60), ("0", 40))

        cond = {}
        self.featCondDister = cond

        # marital status
        cond[("1", 0)] = CategoricalRejectSampler(("married", 100), ("single", 60), ("divorced", 40))
        cond[("0", 0)] = CategoricalRejectSampler(("married", 40), ("single", 100), ("divorced", 40))

        # num of children
        cond[("1", 1)] = CategoricalRejectSampler(("1", 100), ("2", 90), ("3", 40))
        cond[("0", 1)] = CategoricalRejectSampler(("1", 50), ("2", 70), ("3", 100))

        # education
        cond[("1", 2)] = CategoricalRejectSampler(("1", 30), ("2", 80), ("3", 100))
        cond[("0", 2)] = CategoricalRejectSampler(("1", 100), ("2", 40), ("3", 30))

        # self employed
        cond[("1", 3)] = CategoricalRejectSampler(("1", 40), ("0", 100))
        cond[("0", 3)] = CategoricalRejectSampler(("1", 100), ("0", 30))

        # income
        cond[("1", 4)] = GaussianRejectSampler(120, 15)
        cond[("0", 4)] = GaussianRejectSampler(50, 10)

        # years of experience
        cond[("1", 5)] = GaussianRejectSampler(15, 3)
        cond[("0", 5)] = GaussianRejectSampler(5, 1)

        # number of years in current job
        cond[("1", 6)] = GaussianRejectSampler(3, .5)
        cond[("0", 6)] = GaussianRejectSampler(1, .2)

        # outstanding debt
        cond[("1", 7)] = GaussianRejectSampler(20, 5)
        cond[("0", 7)] = GaussianRejectSampler(60, 10)

        # loan amount
        cond[("1", 8)] = GaussianRejectSampler(300, 50)
        cond[("0", 8)] = GaussianRejectSampler(600, 50)

        # loan term
        cond[("1", 9)] = CategoricalRejectSampler(("7", 100), ("15", 40), ("30", 60))
        cond[("0", 9)] = CategoricalRejectSampler(("7", 30), ("15", 100), ("30", 60))

        # credit score
        cond[("1", 10)] = GaussianRejectSampler(700, 20)
        cond[("0", 10)] = GaussianRejectSampler(500, 50)

        if self.addExtra:
            # saving
            cond[("1", 11)] = NormalSampler(80, 10)
            cond[("0", 11)] = NormalSampler(60, 8)

            # retirement, a mixture of zDistr and a class specific normal, with
            # sDistr as the mixture selector
            # NOTE(review): zDistr presumably always yields 0 (no retirement fund) - confirm
            zDistr = NormalSampler(0, 0)
            sDistr = DiscreteRejectSampler(0, 1, 1, 20, 80)
            nzDistr = NormalSampler(100, 20)
            cond[("1", 12)] = DistrMixtureSampler(sDistr, zDistr, nzDistr)
            sDistr = DiscreteRejectSampler(0, 1, 1, 50, 50)
            nzDistr = NormalSampler(40, 10)
            cond[("0", 12)] = DistrMixtureSampler(sDistr, zDistr, nzDistr)

            # num of prior mortgage loans
            cond[("1", 13)] = DiscreteRejectSampler(0, 3, 1, 20, 60, 40, 15)
            cond[("0", 13)] = DiscreteRejectSampler(0, 1, 1, 70, 30)

    def generateTwo(self, noise, keyLen, addExtra):
        """
        ancestral sampling

        Samples self.numLoans records (class label first, then the features
        conditioned on it) and prints each one as id,features,label.

        Parameters
            noise : noise level passed to addNoiseCat for the class label
            keyLen : length of the generated record id
            addExtra : True to generate NFEAT_EXT features instead of NFEAT
        """
        self.addExtra = addExtra
        self.initTwo()

        # sampler
        numChildren = NFEAT_EXT if self.addExtra else NFEAT
        sampler = AncestralSampler(self.approvDistr, self.featCondDister, numChildren)

        # features truncated to int: income, debt, loan amount, credit score
        # and with extra features also saving and retirement
        intFields = [4, 7, 8, 10]
        if self.addExtra:
            intFields += [11, 12]

        for _ in range(self.numLoans):
            (claz, features) = sampler.sample()
            for idx in intFields:
                features[idx] = int(features[idx])

            # add noise to class label
            claz = addNoiseCat(claz, ["0", "1"], noise)

            strFeatures = [toStr(f, 2) for f in features]
            rec = genID(keyLen) + "," + ",".join(strFeatures) + "," + claz
            print(rec)

    def encodeDummy(self, fileName, extra):
        """
        dummy var encoding

        Reads records from a file, dummy variable encodes the marital status
        and loan term columns and prints each encoded row.

        Parameters
            fileName : path of file containing generated records
            extra : True if the records contain the extra features
        """
        # column index is feature index + 1 because of the leading id column
        catVars = {1: self.marStatus, 10: self.loanTerm}

        # record size is feature count plus id and class label columns
        rSize = (NFEAT_EXT if extra else NFEAT) + 2
        dummyVarGen = DummyVarGenerator(rSize, catVars, "1", "0", ",")
        for row in fileRecGen(fileName, None):
            newRow = dummyVarGen.processRow(row)
            print(newRow)
if __name__ == "__main__":
    # command line driver, --op selects one of
    #   gen  : generate loan records
    #   encd : dummy variable encode a generated file
    #   fsel : mutual information based feature selection on a generated file
    parser = argparse.ArgumentParser()
    parser.add_argument('--op', type=str, default = "none", help = "operation")
    parser.add_argument('--nloan', type=int, default = 1000, help = "nom of loans")
    parser.add_argument('--noise', type=float, default = 0.1, help = "class label noise level")
    # NOTE(review): default of 1000 for key length looks like a copy of the --nloan default - confirm
    parser.add_argument('--klen', type=int, default = 1000, help = "key length")
    parser.add_argument('--fpath', type=str, default = "none", help = "source file path")
    parser.add_argument('--algo', type=str, default = "none",
        help = "feature selection algorithm (mrmr, jmi, cmim, icap or infg)")
    args = parser.parse_args()
    op = args.op

    if op == "gen":
        # generate data, always with the extra features
        numLoans = args.nloan
        loan = LoanApprove(numLoans)
        noise = args.noise
        keyLen = args.klen
        addExtra = True
        loan.generateTwo(noise, keyLen, addExtra)

    elif op == "encd":
        # encode binary
        fileName = args.fpath
        extra = True
        loan = LoanApprove()
        loan.encodeDummy(fileName, extra)

    elif op == "fsel":
        # feature select
        fpath = args.fpath
        algo = args.algo
        expl = DataExplorer(False)

        # column indexes are for records generated with the extra features: column 0 is
        # the id, columns 1-14 the features and column 15 the class label
        expl.addFileNumericData(fpath, 5, 8, 11, 12, "income", "debt", "crscore", "saving")
        expl.addFileCatData(fpath, 3, 4, 15, "education", "selfemp", "target")

        # feature and target names, each followed by its type; saving is loaded
        # above but is not among the candidate features
        fdt = ["education", "cat", "selfemp", "cat", "income", "num", "debt", "num", "crscore", "num"]
        tdt = ["target", "cat"]

        # lazily evaluated so that only the chosen algorithm runs
        selectors = {
            "mrmr" : lambda : expl.getMaxRelMinRedFeatures(fdt, tdt, 3),
            "jmi" : lambda : expl.getJointMutInfoFeatures(fdt, tdt, 3),
            "cmim" : lambda : expl.getCondMutInfoMaxFeatures(fdt, tdt, 3),
            "icap" : lambda : expl.getInteractCapFeatures(fdt, tdt, 3),
            "infg" : lambda : expl.getInfoGainFeatures(fdt, tdt, 3, 8),
        }
        if algo in selectors:
            res = selectors[algo]()
            print(res)
        else:
            # previously fell through to print(res) with res unassigned
            exitWithMsg("invalid feature selection algorithm")

    else:
        exitWithMsg("invalid command")