using Microsoft.ML;
using Microsoft.ML.AutoML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms.Text;
using Microsoft.SqlServer.Server;
using NPOI.XSSF.UserModel;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Dynamic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Security.AccessControl;
using System.Security.Cryptography;
using System.Text;
using System.Threading.Tasks;
using System.Xml.Linq;
using static TorchSharp.torch.utils;

namespace testML
{
    /// <summary>
    /// Console entry point: reads the first sheet of an Excel workbook into row dictionaries,
    /// drops columns that are blank in every row, and trains (or loads from a cached .zip file)
    /// one multiclass model per "DESCENDIENTE_S4i*" / "DESCENDIENTE_SNP*" column.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the generic type arguments were missing from the reviewed text and have been
    /// restored from usage (<c>Dictionary&lt;string, object&gt;</c> rows, <c>IDictionaryToObjectConverter</c>
    /// items, <c>IEstimator&lt;ITransformer&gt;</c> pipelines). Confirm them against
    /// <c>DictionaryToObjectConverter.Convert</c>, which is declared elsewhere.
    /// </remarks>
    internal class Program
    {
        /// <summary>Workbook that is read when no path is given on the command line.</summary>
        private const string DefaultWorkbookPath = @"C:\Users\miguel.maldonado\Downloads\entrenar_IA.xlsx";

        static Random rnd = new Random();

        /// <summary>
        /// Loads the workbook, prepares the training rows and trains/evaluates one model per target column.
        /// </summary>
        /// <param name="args">Optional: <c>args[0]</c> overrides the workbook path.</param>
        static void Main(string[] args)
        {
            var workbookPath = (args != null && args.Length > 0) ? args[0] : DefaultWorkbookPath;

            XSSFWorkbook wb;
            using (FileStream file = new FileStream(workbookPath, FileMode.Open, FileAccess.Read))
            {
                wb = new XSSFWorkbook(file);
            }

            var tmpData = LoadRows(wb.GetSheetAt(0));
            if (tmpData.Count == 0)
            {
                // The steps below index the first row, so stop early instead of throwing.
                Console.WriteLine("No data rows were read from the workbook.");
            }
            else
            {
                RemoveBlankColumns(tmpData);
                TrainTargetColumns(tmpData);
            }

            Console.WriteLine();
            Console.WriteLine("Press enter to Exit");
            Console.ReadLine();
        }

        /// <summary>
        /// Converts every data row of the sheet into a dictionary keyed by the sanitized header name.
        /// </summary>
        /// <param name="sheet">Sheet whose row 0 holds the column headers.</param>
        /// <returns>One dictionary per row that was read; empty when the sheet has no header row.</returns>
        private static List<Dictionary<string, object>> LoadRows(NPOI.SS.UserModel.ISheet sheet)
        {
            var rows = new List<Dictionary<string, object>>();
            var headerRow = sheet.GetRow(0);
            if (headerRow == null)
            {
                return rows;
            }

            // NOTE(review): LastRowNum appears to be the zero-based index of the last row, so this
            // bound leaves the final rows of the sheet unread. Kept unchanged because trailing rows
            // may be excluded on purpose — confirm against the workbook layout.
            for (var r = 1; r < sheet.LastRowNum - 1; r++)
            {
                Console.WriteLine(string.Format("{0} / {1}", r, sheet.LastRowNum - 1));

                var row = sheet.GetRow(r);
                if (row == null)
                {
                    // Rows without any cell are not materialized by the reader; skip them.
                    continue;
                }

                var rowData = new Dictionary<string, object>();

                // The prefix is switched by the marker columns and applies to every column after them.
                string prefix = string.Empty;
                for (var c = 0; c < headerRow.LastCellNum; c++)
                {
                    var rawName = headerRow.GetCell(c)?.StringCellValue;
                    if (rawName == null)
                    {
                        // No header cell for this column: there is no name to store the value under.
                        continue;
                    }

                    var columnName = FixColumnName(rawName);
                    var usePrefix = true;
                    switch (columnName)
                    {
                        case "PMASCULINO":
                            prefix = "MASCULINO_";
                            usePrefix = false;
                            break;
                        case "PFEMENINO":
                            prefix = "FEMENINO_";
                            usePrefix = false;
                            break;
                        case "DESCENDIENTE":
                            prefix = "DESCENDIENTE_";
                            usePrefix = false;
                            break;
                    }

                    object value = null;
                    var cell = row.GetCell(c);
                    switch (cell?.CellType)
                    {
                        case NPOI.SS.UserModel.CellType.Numeric:
                            value = cell.NumericCellValue;
                            break;
                        case NPOI.SS.UserModel.CellType.String:
                            value = cell.StringCellValue;
                            break;
                    }

                    // Marker values are tagged with their column name before being stored.
                    string valuePrefix = string.Empty;
                    if (columnName.StartsWith("S4i", StringComparison.Ordinal) || columnName.StartsWith("SNP", StringComparison.Ordinal))
                    {
                        valuePrefix = columnName + "_";
                    }

                    var finalColumnName = (usePrefix ? prefix : string.Empty) + columnName;
                    if (value is string)
                    {
                        rowData.Add(finalColumnName, valuePrefix + value);
                    }
                    else
                    {
                        rowData.Add(finalColumnName, value?.ToString() ?? "");
                    }
                }

                rows.Add(rowData);
            }

            return rows;
        }

        /// <summary>
        /// Removes, from every row, the columns of the first row that hold no non-empty string in any row.
        /// </summary>
        /// <param name="rows">Rows to clean in place; must contain at least one row.</param>
        private static void RemoveBlankColumns(List<Dictionary<string, object>> rows)
        {
            // Snapshot the keys: the dictionaries are modified inside the loop.
            foreach (var key in rows[0].Keys.ToArray())
            {
                var hasValue = rows.Any(row => row.TryGetValue(key, out var value) && !string.IsNullOrEmpty(value as string));
                if (hasValue)
                {
                    continue;
                }

                foreach (var row in rows)
                {
                    row.Remove(key);
                }
            }
        }

        /// <summary>
        /// Runs <see cref="MakePrediction"/> for every descendant marker column that has more than one distinct value.
        /// </summary>
        /// <param name="rows">Prepared rows; must contain at least one row.</param>
        private static void TrainTargetColumns(List<Dictionary<string, object>> rows)
        {
            foreach (var key in rows[0].Keys.ToArray())
            {
                var isTarget = key.StartsWith("DESCENDIENTE_S4i", StringComparison.Ordinal)
                    || key.StartsWith("DESCENDIENTE_SNP", StringComparison.Ordinal);
                if (!isTarget)
                {
                    continue;
                }

                var distinctValues = rows
                    .Select(row => row.TryGetValue(key, out var value) ? value as string : null)
                    .Where(value => !string.IsNullOrEmpty(value))
                    .Distinct()
                    .Count();
                if (distinctValues <= 1)
                {
                    // A column with a single value gives the classifier nothing to learn.
                    continue;
                }

                try
                {
                    var sw = Stopwatch.StartNew();
                    MakePrediction(rows, key);
                    sw.Stop();
                    Console.WriteLine("Elapsed: " + sw.Elapsed.ToString());
                    GC.Collect();
                }
                catch (Exception ex)
                {
                    // Best effort: a failure on one column must not stop the remaining columns.
                    Console.WriteLine(ex.ToString());
                }
            }
        }

        /// <summary>
        /// Trains a model for <paramref name="columnToPredict"/> (or loads it from the cache file
        /// "&lt;column&gt;.&lt;hash&gt;.zip" in the current directory) and prints how many rows it predicts correctly.
        /// </summary>
        /// <param name="tmpData">Prepared rows; must contain at least one row.</param>
        /// <param name="columnToPredict">Name of the column used as the label.</param>
        /// <exception cref="InvalidOperationException">A method looked up through reflection was not found,
        /// or a prediction object could not be read.</exception>
        private static void MakePrediction(List<Dictionary<string, object>> tmpData, string columnToPredict)
        {
            var firstRow = tmpData[0];

            // The cache file name depends on the set of non-descendant marker columns.
            // NOTE(review): OrderBy uses the default (culture-sensitive) string ordering, so the hash may
            // differ between machine cultures. Left unchanged so existing cached model files keep matching.
            var hashKey = string.Join("+", firstRow.Keys
                .Where(x => !x.StartsWith("DESCENDIENTE_", StringComparison.Ordinal) && (x.Contains("_S4i") || x.Contains("_SNP")))
                .OrderBy(x => x));

            string hash;
            using (var md5 = MD5.Create())
            {
                var hashBytes = md5.ComputeHash(new UTF8Encoding(false).GetBytes(hashKey));
                hash = string.Join("", hashBytes.Select(x => x.ToString("X2")));
            }

            var modelFilename = columnToPredict + "." + hash + ".zip";

            MLContext mlContext = new MLContext();
            mlContext.Log += (_, e) =>
            {
                if (e.Kind == Microsoft.ML.Runtime.ChannelMessageKind.Trace && IsNoisyTraceSource(e.Source))
                {
                    return;
                }

                Console.WriteLine(e.RawMessage);
            };

            var dataConverted = DictionaryToObjectConverter.Convert(tmpData, columnToPredict, out Type classType, out Type classPredictionType, out DataViewSchema schema);

            ITransformer trainedModel;
            if (!File.Exists(modelFilename))
            {
                // The row class only exists at run time, so the generic API is bound through reflection.
                // The overload is pinned to the SchemaDefinition one because null is passed as second argument.
                var loadMethod = RequireMethod(
                    mlContext.Data.GetType().GetMethods().FirstOrDefault(x =>
                        x.Name == "LoadFromEnumerable"
                        && x.IsGenericMethodDefinition
                        && x.GetParameters().Length == 2
                        && x.GetParameters()[1].ParameterType == typeof(SchemaDefinition)),
                    "LoadFromEnumerable");
                var data = (IDataView)loadMethod.MakeGenericMethod(classType).Invoke(mlContext.Data, new object[] { dataConverted, null });

                // Split the rows; only the training half is used to fit the model.
                DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.5);
                IDataView trainData = dataSplit.TrainSet;

                // Record the runtime type of every column so the pipeline knows which ones are text.
                var columnNameAndTypes = new Dictionary<string, Type>();
                foreach (var key in firstRow.Keys)
                {
                    var columnType = dataConverted.Cast<IDictionaryToObjectConverter>()
                        .Select(x => x.GetValue(key))
                        .Where(x => x != null)
                        .Select(x => x.GetType())
                        .FirstOrDefault();
                    columnNameAndTypes.Add(key, columnType);
                }

                var pipeline = ProcessData(mlContext, columnToPredict, columnNameAndTypes);
                var trainingPipeline = BuildAndTrainModel(mlContext, trainData, pipeline, classType, classPredictionType);

                Console.WriteLine("Training...");
                trainedModel = trainingPipeline.Fit(trainData);
                mlContext.Model.Save(trainedModel, data.Schema, modelFilename);
            }
            else
            {
                trainedModel = mlContext.Model.Load(modelFilename, out schema);
            }

            // Four arguments are passed below, so only the four-parameter overload can be invoked.
            var createPredictionEngineMethod = RequireMethod(
                mlContext.Model.GetType().GetMethods().FirstOrDefault(x =>
                    x.Name == "CreatePredictionEngine"
                    && x.IsGenericMethodDefinition
                    && x.GetParameters().Length == 4),
                "CreatePredictionEngine");
            var predictionEngine = createPredictionEngineMethod
                .MakeGenericMethod(classType, classPredictionType)
                .Invoke(mlContext.Model, new object[] { trainedModel, null, null, null });

            var predictMethod = RequireMethod(
                predictionEngine.GetType().GetMethods().FirstOrDefault(x =>
                    x.Name == "Predict"
                    && x.GetParameters().Length == 1
                    && x.GetParameters()[0].ParameterType == classType),
                "Predict");

            // NOTE(review): accuracy is measured over every converted row, including the rows the model
            // was trained on; the held-out half of the split is not used.
            var ok = 0;
            var fail = 0;
            foreach (var item in dataConverted.Cast<IDictionaryToObjectConverter>())
            {
                var expected = item.GetValue(columnToPredict) as string;
                if (string.IsNullOrEmpty(expected))
                {
                    continue;
                }

                object predicted;
                item.SetValue(columnToPredict, null); // hide the label from the model
                try
                {
                    var prediction = predictMethod.Invoke(predictionEngine, new object[] { item }) as IDictionaryToObjectConverter;
                    if (prediction == null)
                    {
                        throw new InvalidOperationException("The prediction object does not implement IDictionaryToObjectConverter.");
                    }

                    predicted = prediction.GetValue(columnToPredict);
                }
                finally
                {
                    // Put the label back so the row is not left modified by the evaluation.
                    item.SetValue(columnToPredict, expected);
                }

                if (predicted is string predictedText)
                {
                    Console.Write(item.GetValue("DESCENDIENTE") ?? string.Empty);
                    Console.Write(": ");
                    Console.Write(string.Format("Expected: {0}\t\tPredicted: {1}", expected, predictedText));
                    if (string.Equals(expected, predictedText))
                    {
                        ok++;
                        Console.WriteLine("\tOk");
                    }
                    else
                    {
                        fail++;
                        Console.WriteLine("\tERROR!!!");
                    }
                }
            }

            // Avoid 0/0 (printed as NaN) when no row could be compared.
            var total = ok + fail;
            var percent = total == 0 ? 0.0 : ((double)ok / (double)total) * 100.0;
            Console.WriteLine(string.Format("Ok: {0}, Fail: {1}, Percent: {2}%", ok, fail, percent.ToString("##0.0000")));
        }

        /// <summary>Tells whether a trace message comes from one of the sources that are muted in the console log.</summary>
        private static bool IsNoisyTraceSource(string source)
        {
            if (source == null)
            {
                return false;
            }

            return source.EndsWith(" Cursor", StringComparison.Ordinal)
                || source.EndsWith(" CursorSplitter", StringComparison.Ordinal)
                || source.EndsWith(" Consolidate", StringComparison.Ordinal)
                || source.EndsWith(" Training", StringComparison.Ordinal)
                || source.Equals("RangeFilter; Checking parameters");
        }

        /// <summary>Returns <paramref name="method"/>, or throws a descriptive error when the reflection lookup found nothing.</summary>
        private static MethodInfo RequireMethod(MethodInfo method, string name)
        {
            if (method == null)
            {
                throw new InvalidOperationException("Could not find a suitable '" + name + "' method through reflection.");
            }

            return method;
        }

        /// <summary>
        /// Builds the data-preparation pipeline: maps the label column to a key column named "Label",
        /// featurizes every string column and concatenates them into "Features".
        /// </summary>
        /// <param name="mlContext">ML.NET context used to create the transforms.</param>
        /// <param name="predictColumnName">Label column; excluded from the features.</param>
        /// <param name="columnNames">Column name to runtime type; only <see cref="string"/> columns become features.</param>
        /// <returns>The estimator chain, without a trainer.</returns>
        /// <exception cref="InvalidOperationException">No column qualifies as a feature.</exception>
        private static IEstimator<ITransformer> ProcessData(MLContext mlContext, string predictColumnName, Dictionary<string, Type> columnNames)
        {
            IEstimator<ITransformer> pipeline = mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: predictColumnName, outputColumnName: "Label");

            var featured = new List<string>();
            foreach (var entry in columnNames)
            {
                var key = entry.Key;

                // Neither the label nor any other descendant column may be used as an input.
                if (key == predictColumnName || key.StartsWith("DESCENDIENTE_", StringComparison.Ordinal))
                {
                    continue;
                }

                if (entry.Value == typeof(string))
                {
                    pipeline = pipeline.Append(mlContext.Transforms.Text.FeaturizeText(inputColumnName: key, outputColumnName: key));
                    featured.Add(key);
                }
            }

            if (featured.Count == 0)
            {
                throw new InvalidOperationException("No text feature columns are available to predict '" + predictColumnName + "'.");
            }

            return pipeline.Append(mlContext.Transforms.Concatenate("Features", featured.ToArray()));
        }

        /// <summary>
        /// Appends the SdcaMaximumEntropy multiclass trainer and the key-to-value mapping of "PredictedLabel"
        /// to <paramref name="pipeline"/>. Despite the name, nothing is fitted here.
        /// </summary>
        /// <param name="mlContext">ML.NET context used to create the trainer.</param>
        /// <param name="trainingDataView">Not used by this method.</param>
        /// <param name="pipeline">Data-preparation pipeline producing "Label" and "Features".</param>
        /// <param name="modelType">Not used by this method.</param>
        /// <param name="prodelPredictionType">Not used by this method.</param>
        /// <returns>The complete, not yet fitted, training pipeline.</returns>
        public static IEstimator<ITransformer> BuildAndTrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator<ITransformer> pipeline, Type modelType, Type prodelPredictionType)
        {
            var trainingPipeline = pipeline
                .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features", maximumNumberOfIterations: 1000))
                .Append(mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel"));

            return trainingPipeline;
        }

        /// <summary>
        /// Keeps only letters, digits and underscores of <paramref name="columnName"/>, also dropping the
        /// ordinal indicators 'º' and 'ª'.
        /// </summary>
        /// <param name="columnName">Raw header text; null is treated as empty.</param>
        /// <returns>The sanitized name, possibly empty.</returns>
        private static string FixColumnName(string columnName)
        {
            if (string.IsNullOrEmpty(columnName))
            {
                return string.Empty;
            }

            var result = new StringBuilder(columnName.Length);
            foreach (var c in columnName)
            {
                // The ordinal indicators are excluded explicitly, before the letter/digit test.
                if (c == 'º' || c == 'ª')
                {
                    continue;
                }

                if (char.IsLetter(c) || char.IsNumber(c) || (c == '_'))
                {
                    result.Append(c);
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Creates a <c>Data</c> sample with random fields and values derived from them.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <c>StringTest</c> is not assigned here (its initializer is commented out) but is
        /// dereferenced below; unless <c>Data</c> gives it a default value this throws. No caller is visible
        /// in this file — confirm before using.
        /// </remarks>
        private static Data CreateRandomData()
        {
            var d = new Data()
            {
                Accession = rnd.Next(0, 99999999).ToString("00000000"),
                Enum1 = rnd.Next(1, 4),
                Enum2 = rnd.Next(1, 11),
                Enum3 = rnd.Next(1, 6),
                Enum4 = rnd.Next(1, 6),
                // StringTest = tags[rnd.Next(0, tags.Length)]
            };

            // Overwrites the random Enum4 assigned in the initializer.
            d.Enum4 = d.Enum1 + d.Enum2;

            // Derive values that are related to each other (the network should calibrate itself to learn this formula).
            d.IntegerNumber = (((d.Enum1 + d.Enum2) - (d.Enum3 + d.Enum4)) * 5.25f) + d.StringTest.Length;
            d.DecimalNumber = (d.Enum2 / d.Enum1) * (2.0f + (1.0f / d.StringTest.Length));

            if (d.StringTest == "Azul")
            {
                d.IntegerNumber += 10;
                d.OrigenResultNumber = 1;
            }

            if (d.StringTest == "Rojo")
            {
                d.IntegerNumber += 5f;
                d.OrigenResultNumber = 1;
            }

            return d;
        }
    }
}