Spaces:

ThirdEyeData
/

Customer-Conversion-Prediction

Runtime error

File size: 7,663 Bytes

3d0e51e

#!/usr/local/bin/python3

# Author: Pranab Ghosh
# 
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0 
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License.

# Package imports
import os
import sys
import random
import statistics 
import matplotlib.pyplot as plt 
import argparse
from matumizi.util import *
from matumizi.mlutil import *
from matumizi.daexp import *
from matumizi.sampler import *

# number of base features (indexes 0 - 10) sampled per record
NFEAT = 11
# number of features when the extra ones (indexes 11 - 13) are included
NFEAT_EXT = 14

class LoanApprove:
	"""
	Generates synthetic loan records by ancestral sampling and dummy encodes the
	categorical columns of previously generated files.

	A generated record is printed as a comma separated line: ID, feature values,
	class label ("1" or "0").
	"""
	def __init__(self, numLoans=None):
		"""
		initializer

		Parameters
			numLoans : number of records generateTwo() prints; may stay None when
			only encodeDummy() is used
		"""
		self.numLoans = numLoans
		self.marStatus = ["married", "single", "divorced"]
		self.loanTerm = ["7", "15", "30"]
		self.addExtra = False


	def _setCondDistr(self, featIndex, oneDistr, zeroDistr):
		"""
		registers the two samplers of a feature, one per class label

		Parameters
			featIndex : feature index used in the (class label, feature index) key
			oneDistr : sampler used when the class label is "1"
			zeroDistr : sampler used when the class label is "0"
		"""
		self.featCondDister[("1", featIndex)] = oneDistr
		self.featCondDister[("0", featIndex)] = zeroDistr


	def initTwo(self):
		"""
		initialize samplers: the class label sampler (approvDistr) and the class
		conditional feature samplers (featCondDister), keyed by (class label, feature index).
		Features 11 - 13 are only registered when self.addExtra is set.
		"""
		self.approvDistr = CategoricalRejectSampler(("1", 60), ("0", 40))
		self.featCondDister = {}

		# 0 : marital status
		self._setCondDistr(0,
			CategoricalRejectSampler(("married", 100), ("single", 60), ("divorced", 40)),
			CategoricalRejectSampler(("married", 40), ("single", 100), ("divorced", 40)))

		# 1 : num of children
		self._setCondDistr(1,
			CategoricalRejectSampler(("1", 100), ("2", 90), ("3", 40)),
			CategoricalRejectSampler(("1", 50), ("2", 70), ("3", 100)))

		# 2 : education
		self._setCondDistr(2,
			CategoricalRejectSampler(("1", 30), ("2", 80), ("3", 100)),
			CategoricalRejectSampler(("1", 100), ("2", 40), ("3", 30)))

		# 3 : self employed
		self._setCondDistr(3,
			CategoricalRejectSampler(("1", 40), ("0", 100)),
			CategoricalRejectSampler(("1", 100), ("0", 30)))

		# 4 : income
		self._setCondDistr(4, GaussianRejectSampler(120,15), GaussianRejectSampler(50,10))

		# 5 : years of experience
		self._setCondDistr(5, GaussianRejectSampler(15,3), GaussianRejectSampler(5,1))

		# 6 : number of years in current job
		self._setCondDistr(6, GaussianRejectSampler(3,.5), GaussianRejectSampler(1,.2))

		# 7 : outstanding debt
		self._setCondDistr(7, GaussianRejectSampler(20,5), GaussianRejectSampler(60,10))

		# 8 : loan amount
		self._setCondDistr(8, GaussianRejectSampler(300,50), GaussianRejectSampler(600,50))

		# 9 : loan term
		self._setCondDistr(9,
			CategoricalRejectSampler(("7", 100), ("15", 40), ("30", 60)),
			CategoricalRejectSampler(("7", 30), ("15", 100), ("30", 60)))

		# 10 : credit score
		self._setCondDistr(10, GaussianRejectSampler(700,20), GaussianRejectSampler(500,50))

		if self.addExtra:
			# 11 : saving
			self._setCondDistr(11, NormalSampler(80,10), NormalSampler(60,8))

			# 12 : retirement, a mixture of the shared NormalSampler(0, 0) component and a
			# class specific one; presumably the discrete sampler selects the component
			# (confirm against DistrMixtureSampler)
			zDistr = NormalSampler(0, 0)
			oneRetire = DistrMixtureSampler(DiscreteRejectSampler(0,1,1,20,80), zDistr, NormalSampler(100,20))
			zeroRetire = DistrMixtureSampler(DiscreteRejectSampler(0,1,1,50,50), zDistr, NormalSampler(40,10))
			self._setCondDistr(12, oneRetire, zeroRetire)

			# 13 : num of prior mortgage loans
			self._setCondDistr(13,
				DiscreteRejectSampler(0,3,1,20,60,40,15),
				DiscreteRejectSampler(0,1,1,70,30))


	def generateTwo(self, noise, keyLen, addExtra):
		"""
		ancestral sampling; prints self.numLoans records

		Parameters
			noise : noise level passed to addNoiseCat() for the class label
			keyLen : length passed to genID() for the record ID
			addExtra : True to include the extra features (indexes 11 - 13)
		"""
		# fail fast, range(None) would otherwise fail later with an unclear message
		if self.numLoans is None:
			raise TypeError("numLoans must be set to generate data")

		self.addExtra = addExtra
		self.initTwo()

		#sampler
		numChildren = NFEAT_EXT if self.addExtra else NFEAT
		sampler = AncestralSampler(self.approvDistr, self.featCondDister, numChildren)

		# features truncated to int: income, debt, loan amount, credit score
		# and, when present, saving and retirement
		intFeatures = [4, 7, 8, 10]
		if self.addExtra:
			intFeatures += [11, 12]

		for _ in range(self.numLoans):
			(claz, features) = sampler.sample()
			for fi in intFeatures:
				features[fi] = int(features[fi])

			# add noise to class label
			claz = addNoiseCat(claz, ["0", "1"], noise)

			strFeatures = [toStr(f, 2) for f in features]
			rec = genID(keyLen) + "," + ",".join(strFeatures) + "," + claz
			print(rec)

	def encodeDummy(self, fileName, extra):
		"""
		dummy var encoding of the marital status (column 1) and loan term (column 10)
		columns; every encoded row is printed

		Parameters
			fileName : path of the file with generated records
			extra : True if the records contain the extra features
		"""
		catVars = {1 : self.marStatus, 10 : self.loanTerm}

		# a record holds the ID and the class label on top of the features
		rSize = (NFEAT_EXT if extra else NFEAT) + 2
		dummyVarGen = DummyVarGenerator(rSize, catVars, "1", "0", ",")
		for row in fileRecGen(fileName, None):
			print(dummyVarGen.processRow(row))

if __name__ == "__main__":
	# command line driver; --op selects data generation (gen), dummy encoding (encd)
	# or feature selection (fsel)
	parser = argparse.ArgumentParser()
	parser.add_argument('--op', type=str, default = "none", help = "operation (gen, encd or fsel)")
	parser.add_argument('--nloan', type=int, default = 1000, help = "num of loans")
	parser.add_argument('--noise', type=float, default = 0.1, help = "noise level for class label")
	parser.add_argument('--klen', type=int, default = 1000, help = "key length")
	parser.add_argument('--fpath', type=str, default = "none", help = "source file path")
	parser.add_argument('--algo', type=str, default = "none", help = "feature selection algorithm (mrmr, jmi, cmim, icap or infg)")
	args = parser.parse_args()
	op = args.op
	
	if op == "gen":
		# generate data, always with the extra features
		numLoans = args.nloan
		loan = LoanApprove(numLoans)
		noise = args.noise
		keyLen = args.klen
		addExtra = True 
		loan.generateTwo(noise, keyLen, addExtra)

	elif op == "encd":
		# dummy encode categorical columns of a file generated with the extra features
		fileName = args.fpath
		extra = True
		loan = LoanApprove()
		loan.encodeDummy(fileName, extra)
	
	
	elif op == "fsel":
		# feature select
		fpath = args.fpath
		algo = args.algo
		
		# validate before loading data; an unknown algorithm used to leave res unbound
		# and fail with NameError at print(res)
		if algo not in ("mrmr", "jmi", "cmim", "icap", "infg"):
			exitWithMsg("invalid feature selection algorithm")
		else:
			expl = DataExplorer(False)
			expl.addFileNumericData(fpath, 5, 8, 11, 12, "income", "debt", "crscore", "saving")
			expl.addFileCatData(fpath, 3, 4, 15, "education", "selfemp", "target")
		
			# feature and target names, each followed by its type
			fdt = ["education", "cat", "selfemp", "cat", "income", "num",  "debt", "num", "crscore", "num"]
			tdt = ["target", "cat"]
			if algo == "mrmr":
				res = expl.getMaxRelMinRedFeatures(fdt, tdt, 3)
			elif algo == "jmi":
				res = expl.getJointMutInfoFeatures(fdt, tdt, 3)
			elif algo == "cmim":
				res = expl.getCondMutInfoMaxFeatures(fdt, tdt, 3)
			elif algo == "icap":
				res = expl.getInteractCapFeatures(fdt, tdt, 3)
			else:
				# infg, the only remaining validated value
				res = expl.getInfoGainFeatures(fdt, tdt, 3, 8)

			print(res)
	else:
		exitWithMsg("invalid command")