Priyanka-Kumavat-At-TE's picture
Upload 20 files
3d0e51e
raw
history blame
7.66 kB
#!/usr/local/bin/python3
# Author: Pranab Ghosh
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.
# Package imports
import os
import sys
import random
import statistics
import matplotlib.pyplot as plt
import argparse
from matumizi.util import *
from matumizi.mlutil import *
from matumizi.daexp import *
from matumizi.sampler import *
# number of features in a base record (feature indexes 0 - 10 defined in LoanApprove.initTwo)
NFEAT = 11
# number of features when the extra features (indexes 11 - 13) are also generated
NFEAT_EXT = 14
class LoanApprove:
    """
    Synthetic loan approval data generator and dummy variable encoder.

    Records are generated by ancestral sampling: the approval class ("1" or "0")
    is sampled first and every feature is then sampled from a distribution
    registered for that (class, feature index) pair.
    """

    def __init__(self, numLoans=None):
        """
        initialize generator state

        Parameters
            numLoans : number of records generateTwo() prints; the script leaves
            it as None when only encodeDummy() is used
        """
        self.numLoans = numLoans
        self.marStatus = ["married", "single", "divorced"]
        self.loanTerm = ["7", "15", "30"]
        self.addExtra = False

    def _setCondDistr(self, index, approved, rejected):
        """
        register the class conditional samplers of one feature

        Parameters
            index : feature index
            approved : sampler used when the class is "1"
            rejected : sampler used when the class is "0"
        """
        self.featCondDister[("1", index)] = approved
        self.featCondDister[("0", index)] = rejected

    def initTwo(self):
        """
        initialize samplers

        Builds self.approvDistr (class sampler) and self.featCondDister, a
        dictionary keyed by (class label, feature index). Features 0 - 10 are
        always defined; features 11 - 13 only when self.addExtra is set.
        NOTE(review): categorical sampler arguments look like (value, weight)
        pairs and Gaussian/normal sampler arguments like (mean, std dev) --
        confirm against the matumizi sampler module.
        """
        self.approvDistr = CategoricalRejectSampler(("1", 60), ("0", 40))
        self.featCondDister = {}

        # marital status
        self._setCondDistr(
            0,
            CategoricalRejectSampler(("married", 100), ("single", 60), ("divorced", 40)),
            CategoricalRejectSampler(("married", 40), ("single", 100), ("divorced", 40)),
        )

        # num of children
        self._setCondDistr(
            1,
            CategoricalRejectSampler(("1", 100), ("2", 90), ("3", 40)),
            CategoricalRejectSampler(("1", 50), ("2", 70), ("3", 100)),
        )

        # education
        self._setCondDistr(
            2,
            CategoricalRejectSampler(("1", 30), ("2", 80), ("3", 100)),
            CategoricalRejectSampler(("1", 100), ("2", 40), ("3", 30)),
        )

        # self employed
        self._setCondDistr(
            3,
            CategoricalRejectSampler(("1", 40), ("0", 100)),
            CategoricalRejectSampler(("1", 100), ("0", 30)),
        )

        # income
        self._setCondDistr(4, GaussianRejectSampler(120, 15), GaussianRejectSampler(50, 10))

        # years of experience
        self._setCondDistr(5, GaussianRejectSampler(15, 3), GaussianRejectSampler(5, 1))

        # number of years in current job
        self._setCondDistr(6, GaussianRejectSampler(3, .5), GaussianRejectSampler(1, .2))

        # outstanding debt
        self._setCondDistr(7, GaussianRejectSampler(20, 5), GaussianRejectSampler(60, 10))

        # loan amount
        self._setCondDistr(8, GaussianRejectSampler(300, 50), GaussianRejectSampler(600, 50))

        # loan term
        self._setCondDistr(
            9,
            CategoricalRejectSampler(("7", 100), ("15", 40), ("30", 60)),
            CategoricalRejectSampler(("7", 30), ("15", 100), ("30", 60)),
        )

        # credit score
        self._setCondDistr(10, GaussianRejectSampler(700, 20), GaussianRejectSampler(500, 50))

        if self.addExtra:
            # saving
            self._setCondDistr(11, NormalSampler(80, 10), NormalSampler(60, 8))

            # retirement: mixture of a degenerate sampler at zero and a non zero
            # sampler, with the component chosen by a discrete sampler; the zero
            # component is shared by both classes
            zDistr = NormalSampler(0, 0)
            self._setCondDistr(
                12,
                DistrMixtureSampler(
                    DiscreteRejectSampler(0, 1, 1, 20, 80), zDistr, NormalSampler(100, 20)
                ),
                DistrMixtureSampler(
                    DiscreteRejectSampler(0, 1, 1, 50, 50), zDistr, NormalSampler(40, 10)
                ),
            )

            # num of prior mortgage loans
            self._setCondDistr(
                13,
                DiscreteRejectSampler(0, 3, 1, 20, 60, 40, 15),
                DiscreteRejectSampler(0, 1, 1, 70, 30),
            )

    def generateTwo(self, noise, keyLen, addExtra):
        """
        ancestral sampling

        Prints self.numLoans comma separated records to stdout, each laid out
        as: generated ID, feature values, class label.

        Parameters
            noise : noise level passed to addNoiseCat() for the class label
            keyLen : length passed to genID() for the record ID
            addExtra : True to also generate the extra features (indexes 11 - 13)
        """
        self.addExtra = addExtra
        self.initTwo()

        # sampler
        numFeatures = NFEAT_EXT if self.addExtra else NFEAT
        sampler = AncestralSampler(self.approvDistr, self.featCondDister, numFeatures)

        # features sampled as real values but written out as whole numbers
        intFeatures = [4, 7, 8, 10]
        if self.addExtra:
            intFeatures += [11, 12]

        for _ in range(self.numLoans):
            (claz, features) = sampler.sample()
            for fi in intFeatures:
                features[fi] = int(features[fi])

            # add noise to the class label
            claz = addNoiseCat(claz, ["0", "1"], noise)

            strFeatures = [toStr(f, 2) for f in features]
            rec = genID(keyLen) + "," + ",".join(strFeatures) + "," + claz
            print(rec)

    def encodeDummy(self, fileName, extra):
        """
        dummy var encoding

        Reads records from a file, passes each through a DummyVarGenerator for
        the categorical columns and prints the result to stdout.

        Parameters
            fileName : path of the file holding the generated records
            extra : True if the records contain the extra features
        """
        # for records laid out as generateTwo() prints them, column 0 is the ID,
        # so marital status (feature 0) is column 1 and loan term (feature 9)
        # is column 10
        catVars = {1: self.marStatus, 10: self.loanTerm}

        # record size is the feature count plus the ID and class label columns
        rSize = (NFEAT_EXT if extra else NFEAT) + 2
        dummyVarGen = DummyVarGenerator(rSize, catVars, "1", "0", ",")
        for row in fileRecGen(fileName, None):
            print(dummyVarGen.processRow(row))
# Command line driver. Operations (--op):
#   gen  : generate loan records and print them to stdout
#   encd : dummy variable encode the records in the file given by --fpath
#   fsel : run the feature selection algorithm given by --algo on the file given by --fpath
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--op', type=str, default = "none", help = "operation")
    parser.add_argument('--nloan', type=int, default = 1000, help = "nom of loans")
    parser.add_argument('--noise', type=float, default = 0.1, help = "class label noise level")
    # NOTE(review): a default key length of 1000 looks unusually large -- confirm
    parser.add_argument('--klen', type=int, default = 1000, help = "key length")
    parser.add_argument('--fpath', type=str, default = "none", help = "source file path")
    parser.add_argument('--algo', type=str, default = "none", help = "feature selection algorithm (mrmr, jmi, cmim, icap, infg)")
    args = parser.parse_args()
    op = args.op

    if op == "gen":
        # generate data, always with the extra features
        numLoans = args.nloan
        loan = LoanApprove(numLoans)
        noise = args.noise
        keyLen = args.klen
        addExtra = True
        loan.generateTwo(noise, keyLen, addExtra)

    elif op == "encd":
        # encode binary, assuming records that contain the extra features
        fileName = args.fpath
        extra = True
        loan = LoanApprove()
        loan.encodeDummy(fileName, extra)

    elif op == "fsel":
        # feature select
        fpath = args.fpath
        algo = args.algo
        expl = DataExplorer(False)

        # column indexes follow the layout printed by LoanApprove.generateTwo()
        # with extra features: column 0 is the ID, feature i is column i + 1 and
        # the class label is column 15
        expl.addFileNumericData(fpath, 5, 8, 11, 12, "income", "debt", "crscore", "saving")
        expl.addFileCatData(fpath, 3, 4, 15, "education", "selfemp", "target")

        # NOTE(review): "saving" is loaded above but is not among the candidate features -- confirm
        fdt = ["education", "cat", "selfemp", "cat", "income", "num", "debt", "num", "crscore", "num"]
        tdt = ["target", "cat"]

        # dispatch table so an unknown algorithm name is reported instead of
        # failing later on an undefined result
        selectors = {
            "mrmr": lambda: expl.getMaxRelMinRedFeatures(fdt, tdt, 3),
            "jmi": lambda: expl.getJointMutInfoFeatures(fdt, tdt, 3),
            "cmim": lambda: expl.getCondMutInfoMaxFeatures(fdt, tdt, 3),
            "icap": lambda: expl.getInteractCapFeatures(fdt, tdt, 3),
            "infg": lambda: expl.getInfoGainFeatures(fdt, tdt, 3, 8),
        }
        selector = selectors.get(algo)
        if selector is None:
            exitWithMsg("invalid feature selection algorithm")
        else:
            res = selector()
            print(res)

    else:
        exitWithMsg("invalid command")