diff --git a/testML/Program.cs b/testML/Program.cs index 546c671..6c48a35 100644 --- a/testML/Program.cs +++ b/testML/Program.cs @@ -7,15 +7,18 @@ using Microsoft.SqlServer.Server; using NPOI.XSSF.UserModel; using System; using System.Collections.Generic; +using System.Diagnostics; using System.Dynamic; using System.Globalization; using System.IO; using System.Linq; using System.Reflection; using System.Security.AccessControl; +using System.Security.Cryptography; using System.Text; using System.Threading.Tasks; using System.Xml.Linq; +using static TorchSharp.torch.utils; namespace testML { @@ -98,20 +101,16 @@ namespace testML { rowData.Add(finalColumnName, value?.ToString() ?? ""); } - - - } tmpData.Add(rowData); } - - var columnToPredict = "DESCENDIENTE_S4i001"; - + //Eliminamos las columnas en blanco var firstRow = tmpData[0] as IDictionary; foreach (var key in firstRow.Keys.ToArray()) { - var firstValue = (from x in tmpData where x.ContainsKey(key) && x[key] != null && !string.IsNullOrEmpty(x[key] as string) select x[key]).FirstOrDefault(); + var values = (from x in tmpData where x.ContainsKey(key) && x[key] != null && !string.IsNullOrEmpty(x[key] as string) select x[key]); + var firstValue = values.FirstOrDefault(); if (firstValue == null) { foreach (var item in tmpData) @@ -121,10 +120,66 @@ namespace testML item.Remove(key); } } + } + } + + foreach (var key in firstRow.Keys) + { + if (key.StartsWith("DESCENDIENTE_S4i") || + key.StartsWith("DESCENDIENTE_SNP")) + { + var values = (from x in tmpData where x.ContainsKey(key) && x[key] != null && !string.IsNullOrEmpty(x[key] as string) select x[key]).Distinct().ToArray(); + if (values.Length > 1) + { + + try + { + var sw = new Stopwatch(); + sw.Start(); + + MakePrediction(tmpData, key); + + sw.Stop(); + + Console.WriteLine("Elapsed: " + sw.Elapsed.ToString()); + + GC.Collect(); + } + catch (Exception ex) + { + Console.WriteLine(ex.ToString()); + } + } + else + { + + } } } - #endregion + Console.WriteLine(); + Console.WriteLine("Press enter to Exit"); + Console.ReadLine(); + } + + private static void MakePrediction(List> tmpData, string columnToPredict) + { + var firstRow = tmpData[0] as IDictionary; + + var hashKey = new StringBuilder(); + foreach (var key in firstRow.Keys.Where(x => !x.StartsWith("DESCENDIENTE_") && (x.Contains("_S4i") || x.Contains("_SNP"))).OrderBy(x => x)) + { + if (hashKey.Length > 0) { hashKey.Append("+"); } + hashKey.Append(key); + } + + var md5 = MD5.Create(); + var hash = string.Join("", md5.ComputeHash(new MemoryStream(new UTF8Encoding(false).GetBytes(hashKey.ToString()))).Select(x => x.ToString("X2").ToUpper()).ToArray()); + + var modelFilename = columnToPredict + "." + hash + ".zip"; + + + #endregion MLContext mlContext = new MLContext(); @@ -147,42 +202,48 @@ namespace testML //} }; + var dataConverted = DictionaryToObjectConverter.Convert(tmpData, columnToPredict, out Type classType, out Type classPredictionType, out DataViewSchema schema); - tmpData = null; //Liberamos la memoria - - - var loadMethod = mlContext.Data.GetType().GetMethods().Where(x => x.Name == "LoadFromEnumerable" && x.IsGenericMethodDefinition).FirstOrDefault(); - var loadMethodObj = loadMethod.MakeGenericMethod(classType); - var data = (IDataView)loadMethodObj.Invoke(mlContext.Data, new object[] { dataConverted, null }); - - #region Cortamos los datos de entrenamiento en (Datos para entenar y Datos para hacer el test de precisión) - - DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.5); - IDataView trainData = dataSplit.TrainSet; - IDataView testData = dataSplit.TestSet; - - #endregion - - - #region Preparamos los datos de entrada y salida - - - var columnNameAndTypes = new Dictionary(); - foreach (var item in (from x in firstRow.Keys - select new { Key = x, Type = (from y in dataConverted.Cast() where y.GetValue(x) != null select y.GetValue(x).GetType()).FirstOrDefault() }) - ) + ITransformer _trainedModel; + if (!File.Exists(modelFilename)) { - columnNameAndTypes.Add(item.Key, item.Type); + var loadMethod = mlContext.Data.GetType().GetMethods().Where(x => x.Name == "LoadFromEnumerable" && x.IsGenericMethodDefinition).FirstOrDefault(); + var loadMethodObj = loadMethod.MakeGenericMethod(classType); + var data = (IDataView)loadMethodObj.Invoke(mlContext.Data, new object[] { dataConverted, null }); + + #region Cortamos los datos de entrenamiento en (Datos para entenar y Datos para hacer el test de precisión) + + DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.5); + IDataView trainData = dataSplit.TrainSet; + IDataView testData = dataSplit.TestSet; + + #endregion + + + #region Preparamos los datos de entrada y salida + + + var columnNameAndTypes = new Dictionary(); + foreach (var item in (from x in firstRow.Keys + select new { Key = x, Type = (from y in dataConverted.Cast() where y.GetValue(x) != null select y.GetValue(x).GetType()).FirstOrDefault() }) + ) + { + columnNameAndTypes.Add(item.Key, item.Type); + } + + var pipeline = ProcessData(mlContext, columnToPredict, columnNameAndTypes); + var trainingPipeline = BuildAndTrainModel(mlContext, trainData, pipeline, classType, classPredictionType); + + Console.WriteLine("Training..."); + _trainedModel = trainingPipeline.Fit(trainData); + + mlContext.Model.Save(_trainedModel, data.Schema, modelFilename); + } + else + { + _trainedModel = mlContext.Model.Load(modelFilename, out schema); } - - var pipeline = ProcessData(mlContext, columnToPredict, columnNameAndTypes); - var trainingPipeline = BuildAndTrainModel(mlContext, trainData, pipeline, classType, classPredictionType); - - Console.WriteLine("Training..."); - var _trainedModel = trainingPipeline.Fit(trainData); - - mlContext.Model.Save(_trainedModel, data.Schema, columnToPredict + ".zip"); var createPredictionEngineMethod = mlContext.Model.GetType().GetMethods().Where(x => x.Name == "CreatePredictionEngine" && x.IsGenericMethodDefinition).FirstOrDefault(); var createPredictionEngineMethodObj = createPredictionEngineMethod.MakeGenericMethod(classType, classPredictionType); @@ -228,7 +289,6 @@ namespace testML Console.WriteLine(string.Format("Ok: {0}, Fail: {1}, Percent: {2}%", ok, fail, (((double)ok / (double)(ok + fail)) * 100.0).ToString("##0.0000"))); - #endregion /* @@ -266,13 +326,8 @@ namespace testML #endregion */ - Console.WriteLine(); - Console.WriteLine("Press enter to Exit"); - Console.ReadLine(); - } - private static IEstimator ProcessData(MLContext mlContext, string predictColumnName, Dictionary columnNames) { IEstimator pipeline = mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: predictColumnName, outputColumnName: "Label"); @@ -300,7 +355,7 @@ namespace testML public static IEstimator BuildAndTrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator pipeline, Type modelType, Type prodelPredictionType) { - var trainingPipeline = pipeline.Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features", maximumNumberOfIterations: 1000)) + var trainingPipeline = pipeline.Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features", maximumNumberOfIterations: 1000)) .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel")); return trainingPipeline;