447 lines
13 KiB
C#
447 lines
13 KiB
C#
using Microsoft.ML;
|
|
using Microsoft.ML.AutoML;
|
|
using Microsoft.ML.Data;
|
|
using Microsoft.ML.Trainers;
|
|
using Microsoft.ML.Transforms.Text;
|
|
using Microsoft.SqlServer.Server;
|
|
using NPOI.XSSF.UserModel;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics;
|
|
using System.Dynamic;
|
|
using System.Globalization;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Reflection;
|
|
using System.Security.AccessControl;
|
|
using System.Security.Cryptography;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using System.Xml.Linq;
|
|
using static TorchSharp.torch.utils;
|
|
|
|
namespace testML
|
|
{
|
|
internal class Program
|
|
{
|
|
// Shared pseudo-random generator for the whole program; CreateRandomData draws all its values from it.
static Random rnd = new Random();
|
|
|
|
/// <summary>
/// Entry point. Loads the training workbook, flattens every data row into a
/// column-name -> text dictionary, removes the columns that are blank in every row and then
/// trains/evaluates one model for each DESCENDIENTE_S4i* / DESCENDIENTE_SNP* column that has
/// more than one distinct non-empty value.
/// </summary>
/// <remarks>
/// Sheet layout read here: row 0 carries an optional numeric "CR" tag per column, row 1 carries
/// the column headers, and the data starts on the row after the header row.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    XSSFWorkbook wb;
    using (FileStream file = new FileStream(@"entrenar_IA (1).xlsx", FileMode.Open, FileAccess.Read))
    {
        wb = new XSSFWorkbook(file);
    }

    var sheet = wb.GetSheetAt(0);

    var CRRow = sheet.GetRow(0);
    var headerRow = sheet.GetRow(1);
    if (headerRow == null)
    {
        // Fail with an explicit message instead of a NullReferenceException further down.
        throw new InvalidOperationException("The first sheet has no header row (expected on the second row).");
    }

    #region Prepare the training data
    // NOTE: this region is intentionally left open here; its #endregion lives inside MakePrediction.

    var tmpData = new List<Dictionary<string, object>>();

    // NOTE(review): the upper bound "LastRowNum - 1" is kept exactly as it was; if LastRowNum is the
    // index of the last row, the final rows of the sheet are never read - confirm this is intended.
    var rowLimit = sheet.LastRowNum - 1;

    for (var r = headerRow.RowNum + 1; r < rowLimit; r++)
    {
        Console.WriteLine(string.Format("{0} / {1}", r, rowLimit));

        var row = sheet.GetRow(r);
        if (row == null)
        {
            // GetRow can return null for a row that does not exist in the sheet; nothing to learn from it.
            continue;
        }

        var rowData = new Dictionary<string, object>();

        // Group prefix ("MASCULINO_", "FEMENINO_", "DESCENDIENTE_") applied to the columns that follow a
        // group marker column. It is reset for every row and updated while walking the columns left to right.
        string prefix = string.Empty;

        for (var c = 0; c < headerRow.LastCellNum; c++)
        {
            // A missing header cell is treated as an empty header instead of crashing in FixColumnName.
            var columnName = FixColumnName(headerRow.GetCell(c)?.StringCellValue ?? string.Empty);
            var crCell = CRRow?.GetCell(c)?.NumericCellValue;

            // The marker column itself is stored without the prefix it introduces.
            var usePrefix = true;
            switch (columnName)
            {
                case "PMASCULINO":
                    prefix = "MASCULINO_";
                    usePrefix = false;
                    break;
                case "PFEMENINO":
                    prefix = "FEMENINO_";
                    usePrefix = false;
                    break;
                case "DESCENDIENTE":
                    prefix = "DESCENDIENTE_";
                    usePrefix = false;
                    break;
            }

            // Only numeric and string cells are read; every other cell type is stored as an empty string.
            var cell = row.GetCell(c);
            object value = null;
            switch (cell?.CellType)
            {
                case NPOI.SS.UserModel.CellType.Numeric: value = cell.NumericCellValue; break;
                case NPOI.SS.UserModel.CellType.String: value = cell.StringCellValue; break;
            }

            // Text values of S4i*/SNP* columns are qualified with their own column name.
            var isMarkerColumn = columnName.StartsWith("S4i", StringComparison.Ordinal)
                || columnName.StartsWith("SNP", StringComparison.Ordinal);
            var valuePrefix = isMarkerColumn ? columnName + "_" : string.Empty;

            var finalColumnName = (usePrefix ? prefix : string.Empty) + columnName;
            if (crCell != null)
            {
                // NOTE(review): ToString() is culture-sensitive; kept as-is because the cached model
                // file names built in MakePrediction are derived from these column names.
                finalColumnName = finalColumnName + "_CR" + crCell.Value.ToString();
            }

            var text = value is string s ? valuePrefix + s : (value?.ToString() ?? "");

            // Add (not the indexer) on purpose: a duplicated column name is reported, not silently overwritten.
            rowData.Add(finalColumnName, text);
        }

        tmpData.Add(rowData);
    }

    if (tmpData.Count == 0)
    {
        Console.WriteLine("No data rows were found in the workbook.");
    }
    else
    {
        RemoveBlankColumns(tmpData);

        // Snapshot of the keys: the dictionaries are handed to MakePrediction while we iterate.
        foreach (var key in tmpData[0].Keys.ToArray())
        {
            var isTarget = key.StartsWith("DESCENDIENTE_S4i", StringComparison.Ordinal)
                || key.StartsWith("DESCENDIENTE_SNP", StringComparison.Ordinal);
            if (!isTarget)
            {
                continue;
            }

            // A column with a single distinct value (or none) gives the classifier nothing to learn.
            var distinctValues = tmpData
                .Select(x => x.TryGetValue(key, out var v) ? v as string : null)
                .Where(v => !string.IsNullOrEmpty(v))
                .Distinct()
                .Count();
            if (distinctValues <= 1)
            {
                continue;
            }

            try
            {
                var sw = Stopwatch.StartNew();

                MakePrediction(tmpData, key);

                sw.Stop();

                Console.WriteLine("Elapsed: " + sw.Elapsed.ToString());

                // Kept from the original code: forces a collection between consecutive trainings.
                GC.Collect();
            }
            catch (Exception ex)
            {
                // Best effort: a failure on one column is logged and the remaining columns still run.
                Console.WriteLine(ex.ToString());
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine("Press enter to Exit");
    Console.ReadLine();
}

/// <summary>
/// Removes, from every row, each column of the first row that has no non-empty string value in any row.
/// </summary>
/// <param name="rows">Row dictionaries to clean in place; must contain at least one row.</param>
private static void RemoveBlankColumns(List<Dictionary<string, object>> rows)
{
    // ToArray: keys are removed from the first row while we walk them.
    foreach (var key in rows[0].Keys.ToArray())
    {
        var hasValue = rows.Any(x => x.TryGetValue(key, out var v) && !string.IsNullOrEmpty(v as string));
        if (hasValue)
        {
            continue;
        }

        foreach (var row in rows)
        {
            row.Remove(key);
        }
    }
}
|
|
|
|
/// <summary>
/// Trains (or loads from a cached .zip file) a multiclass model that predicts
/// <paramref name="columnToPredict"/> and prints, for every row that has a value in that column,
/// the expected and predicted value followed by an Ok/Fail summary.
/// </summary>
/// <param name="tmpData">Rows as column-name -> value dictionaries; the first row defines the column set.</param>
/// <param name="columnToPredict">Name of the label column; a "_CR&lt;number&gt;" tag in it selects the columns used for the cache key.</param>
/// <exception cref="ArgumentException"><paramref name="tmpData"/> is null or empty.</exception>
/// <exception cref="InvalidOperationException">A required ML.NET method could not be resolved through reflection, or a prediction has an unexpected type.</exception>
private static void MakePrediction(List<Dictionary<string, object>> tmpData, string columnToPredict)
{
    if (tmpData == null || tmpData.Count == 0)
    {
        throw new ArgumentException("At least one data row is required.", nameof(tmpData));
    }

    // "_CR<digits>" tag of the target column. When there is none this is the empty string, and
    // Contains("") below then accepts every candidate key.
    var currentCR = Regex.Match(columnToPredict, @"_CR\d+").Value;

    var firstRow = tmpData[0];

    // Cache key: sorted names of the non-descendant S4i/SNP columns that carry the same CR tag.
    // NOTE(review): Contains("_CR1") also matches "_CR10", "_CR11"...; left unchanged because
    // altering the key would orphan the model files that are already cached on disk.
    var hashedColumns = firstRow.Keys
        .Where(x => !x.StartsWith("DESCENDIENTE_") && (x.Contains("_S4i") || x.Contains("_SNP")))
        .Where(x => x.Contains(currentCR))
        .OrderBy(x => x);
    var hashKey = string.Join("+", hashedColumns);

    // MD5 is only used to derive a short file name here, not for anything security related.
    string hash;
    using (var md5 = MD5.Create())
    {
        var digest = md5.ComputeHash(new UTF8Encoding(false).GetBytes(hashKey));
        hash = string.Concat(digest.Select(x => x.ToString("X2")));
    }

    var modelFilename = columnToPredict + "." + hash + ".zip";
    var objectFilename = columnToPredict + "." + hash + ".dll";

    // Closes the "training data" region that is opened inside Main.
    #endregion

    MLContext mlContext = new MLContext();

    mlContext.Log += (_, e) =>
    {
        // Drop the noisiest trace messages; everything else is echoed to the console.
        if (e.Kind == Microsoft.ML.Runtime.ChannelMessageKind.Trace)
        {
            var source = e.Source;
            if (source.EndsWith(" Cursor") ||
                source.EndsWith(" CursorSplitter") ||
                source.EndsWith(" Consolidate") ||
                source.EndsWith(" Training") ||
                source.Equals("RangeFilter; Checking parameters"))
            {
                return;
            }
        }

        Console.WriteLine(e.RawMessage);
    };

    // classType / classPredictionType are the row types used below as the input and output types of
    // the prediction engine. NOTE(review): objectFilename is presumably where the converter stores
    // the generated types - confirm against DictionaryToObjectConverter.
    var dataConverted = DictionaryToObjectConverter.Convert(tmpData, columnToPredict, objectFilename, out Type classType, out Type classPredictionType, out DataViewSchema schema);

    ITransformer _trainedModel;
    if (!File.Exists(modelFilename))
    {
        // The row type is only known at run time, so the generic method is bound through reflection.
        // LoadFromEnumerable has more than one generic overload; pick the SchemaDefinition one
        // explicitly instead of relying on the order in which reflection returns them.
        var loadMethod = mlContext.Data.GetType().GetMethods().FirstOrDefault(x =>
        {
            if (x.Name != "LoadFromEnumerable" || !x.IsGenericMethodDefinition) { return false; }
            var p = x.GetParameters();
            return p.Length == 2 && p[1].ParameterType == typeof(Microsoft.ML.Data.SchemaDefinition);
        });
        if (loadMethod == null)
        {
            throw new InvalidOperationException("Could not find LoadFromEnumerable<TRow>(IEnumerable<TRow>, SchemaDefinition) on the data catalog.");
        }

        var data = (IDataView)loadMethod.MakeGenericMethod(classType).Invoke(mlContext.Data, new object[] { dataConverted, null });

        // Split the rows into a training half and a test half.
        // NOTE(review): only the training half is used; the accuracy loop at the end of this method
        // walks ALL rows (training rows included), so the reported percentage is optimistic.
        DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.5);
        IDataView trainData = dataSplit.TrainSet;

        // Runtime type of each column = type of its first non-null value (null when it has none).
        var columnNameAndTypes = new Dictionary<string, Type>();
        foreach (var key in firstRow.Keys)
        {
            var columnType = dataConverted.Cast<IDictionaryToObjectConverter>()
                .Select(y => y.GetValue(key))
                .Where(v => v != null)
                .Select(v => v.GetType())
                .FirstOrDefault();
            columnNameAndTypes.Add(key, columnType);
        }

        var pipeline = ProcessData(mlContext, columnToPredict, columnNameAndTypes);
        var trainingPipeline = BuildAndTrainModel(mlContext, trainData, pipeline, classType, classPredictionType);

        Console.WriteLine("Training...");

        _trainedModel = trainingPipeline.Fit(trainData);

        mlContext.Model.Save(_trainedModel, data.Schema, modelFilename);
    }
    else
    {
        _trainedModel = mlContext.Model.Load(modelFilename, out schema);
    }

    // The engine is created with four arguments, so only a four-parameter overload can be invoked.
    var createPredictionEngineMethod = mlContext.Model.GetType().GetMethods().FirstOrDefault(x =>
        x.Name == "CreatePredictionEngine" && x.IsGenericMethodDefinition && x.GetParameters().Length == 4);
    if (createPredictionEngineMethod == null)
    {
        throw new InvalidOperationException("Could not find a four-parameter CreatePredictionEngine<TSrc, TDst> overload on the model catalog.");
    }

    // NOTE(review): the null passed for the second (bool) parameter is presumably turned into
    // 'false' by reflection rather than the API default - confirm this is the intended setting.
    var _predEngine = createPredictionEngineMethod
        .MakeGenericMethod(classType, classPredictionType)
        .Invoke(mlContext.Model, new object[] { _trainedModel, null, null, null });

    var predictMethod = _predEngine.GetType().GetMethods().FirstOrDefault(x =>
        x.Name == "Predict" && x.GetParameters().Length == 1 && x.GetParameters()[0].ParameterType == classType);
    if (predictMethod == null)
    {
        throw new InvalidOperationException("The prediction engine does not expose Predict(" + classType.Name + ").");
    }

    // Evaluation: hide the label of every labelled row, predict it and compare.
    var ok = 0;
    var fail = 0;
    foreach (var item in dataConverted.Cast<IDictionaryToObjectConverter>())
    {
        var expected = item.GetValue(columnToPredict);
        if (expected == null || string.IsNullOrEmpty(expected as string)) { continue; }

        // The label is blanked before predicting and is not restored afterwards.
        item.SetValue(columnToPredict, null);

        var prediction = predictMethod.Invoke(_predEngine, new object[] { item }) as IDictionaryToObjectConverter;
        if (prediction == null)
        {
            throw new InvalidOperationException("Predict returned a value that does not implement IDictionaryToObjectConverter.");
        }

        var predicted = prediction.GetValue(columnToPredict);

        // Rows whose prediction is not a string are neither printed nor counted.
        if (expected is string a && predicted is string b)
        {
            Console.Write(item.GetValue("DESCENDIENTE") ?? string.Empty);
            Console.Write(": ");

            Console.Write(string.Format("Expected: {0}\t\tPredicted: {1}", a, b));

            if (string.Equals(a, b))
            {
                ok++;
                Console.WriteLine("\tOk");
            }
            else
            {
                fail++;
                Console.WriteLine("\tERROR!!!");
            }
        }
    }

    // Avoid printing "NaN" when no row could be evaluated.
    var total = ok + fail;
    var percent = total == 0 ? 0.0 : ((double)ok / (double)total) * 100.0;
    Console.WriteLine(string.Format("Ok: {0}, Fail: {1}, Percent: {2}%", ok, fail, percent.ToString("##0.0000")));
}
|
|
|
|
/// <summary>
/// Builds the data-preparation part of the pipeline: maps the target column to a key column named
/// "Label", featurizes every string column that is used as an input, and concatenates those
/// columns into "Features".
/// </summary>
/// <param name="mlContext">ML.NET context used to create the transforms.</param>
/// <param name="predictColumnName">Name of the column to predict; never used as a feature.</param>
/// <param name="columnNames">Column name -> runtime type of its values (the type may be null).</param>
/// <returns>The estimator chain ending in the "Features" concatenation.</returns>
private static IEstimator<ITransformer> ProcessData(MLContext mlContext, string predictColumnName, Dictionary<string, Type> columnNames)
{
    IEstimator<ITransformer> estimator = mlContext.Transforms.Conversion.MapValueToKey(inputColumnName: predictColumnName, outputColumnName: "Label");

    // Inputs are the string-typed columns, minus the target and minus every DESCENDIENTE_* column.
    var textColumns = columnNames
        .Where(entry => entry.Key != predictColumnName)
        .Where(entry => !entry.Key.StartsWith("DESCENDIENTE_"))
        .Where(entry => entry.Value == typeof(string))
        .Select(entry => entry.Key)
        .ToArray();

    foreach (var column in textColumns)
    {
        // Featurized in place: the output column reuses the input column's name.
        estimator = estimator.Append(mlContext.Transforms.Text.FeaturizeText(inputColumnName: column, outputColumnName: column));
    }

    estimator = estimator.Append(mlContext.Transforms.Concatenate("Features", textColumns));

    return estimator;
}
|
|
|
|
/// <summary>
/// Completes the pipeline with an SDCA maximum-entropy multiclass trainer ("Label" -> "Features",
/// at most 1000 iterations) and a final step that maps "PredictedLabel" keys back to values.
/// Despite its name this method only builds the estimator chain; it does not call Fit.
/// </summary>
/// <param name="mlContext">ML.NET context used to create the trainer and the transform.</param>
/// <param name="trainingDataView">Not used by this method.</param>
/// <param name="pipeline">Data-preparation estimator chain to extend.</param>
/// <param name="modelType">Not used by this method.</param>
/// <param name="prodelPredictionType">Not used by this method.</param>
/// <returns>The complete, still untrained, estimator chain.</returns>
public static IEstimator<ITransformer> BuildAndTrainModel(MLContext mlContext, IDataView trainingDataView, IEstimator<ITransformer> pipeline, Type modelType, Type prodelPredictionType)
{
    var trainer = mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features", maximumNumberOfIterations: 1000);
    var keyToValue = mlContext.Transforms.Conversion.MapKeyToValue("PredictedLabel");

    return pipeline.Append(trainer).Append(keyToValue);
}
|
|
|
|
|
|
/// <summary>
/// Sanitizes a spreadsheet header so it only contains letters, numeric characters and
/// underscores. Every other character (spaces, punctuation, ...) is dropped, and so are the
/// ordinal indicators 'º' and 'ª'.
/// </summary>
/// <param name="columnName">Raw header text; may be null or empty.</param>
/// <returns>The sanitized name; an empty string when <paramref name="columnName"/> is null or empty.</returns>
private static string FixColumnName(string columnName)
{
    // A missing header is treated like an empty one instead of throwing a NullReferenceException.
    if (string.IsNullOrEmpty(columnName))
    {
        return string.Empty;
    }

    var result = new StringBuilder(columnName.Length);

    foreach (var c in columnName)
    {
        // The ordinal indicators pass char.IsLetter, so they have to be rejected explicitly.
        if (c == 'º' || c == 'ª')
        {
            continue;
        }

        if (c == '_' || char.IsLetter(c) || char.IsNumber(c))
        {
            result.Append(c);
        }
    }

    return result.ToString();
}
|
|
|
|
/// <summary>
/// Creates a synthetic <c>Data</c> sample whose numeric outputs are derived from its random inputs
/// by fixed formulas, so that a model has a relationship to learn.
/// </summary>
/// <remarks>
/// NOTE(review): StringTest is never assigned in this method, yet its Length is read below; unless
/// the Data type initializes it, this throws - confirm against the Data class.
/// </remarks>
/// <returns>The populated sample.</returns>
private static Data CreateRandomData()
{
    // The order of the rnd.Next calls is kept so the shared generator is consumed identically.
    var sample = new Data()
    {
        Accession = rnd.Next(0, 99999999).ToString("00000000"),
        Enum1 = rnd.Next(1, 4),
        Enum2 = rnd.Next(1, 11),
        Enum3 = rnd.Next(1, 6),
        Enum4 = rnd.Next(1, 6),
    };

    // The random Enum4 drawn above is replaced by a value derived from Enum1 and Enum2.
    sample.Enum4 = sample.Enum1 + sample.Enum2;

    // Outputs deliberately related to the inputs (the model is expected to learn these formulas).
    var balance = (sample.Enum1 + sample.Enum2) - (sample.Enum3 + sample.Enum4);
    sample.IntegerNumber = (balance * 5.25f) + sample.StringTest.Length;

    sample.DecimalNumber = (sample.Enum2 / sample.Enum1) * (2.0f + (1.0f / sample.StringTest.Length));

    // Colour-specific adjustments.
    // NOTE(review): both colours set OrigenResultNumber to 1 - confirm that is intended.
    switch (sample.StringTest)
    {
        case "Azul":
            sample.IntegerNumber += 10;
            sample.OrigenResultNumber = 1;
            break;

        case "Rojo":
            sample.IntegerNumber += 5f;
            sample.OrigenResultNumber = 1;
            break;
    }

    return sample;
}
|
|
}
|
|
|
|
|
|
|
|
}
|