public static DoubleMatrix Encode(
TextReader reader,
char columnDelimiter,
IndexCollection extractedColumns,
bool firstLineContainsVariableNames,
Dictionary<int, Codifier> specialCodifiers,
IFormatProvider provider
)
Public Shared Function Encode (
reader As TextReader,
columnDelimiter As Char,
extractedColumns As IndexCollection,
firstLineContainsVariableNames As Boolean,
specialCodifiers As Dictionary(Of Integer, Codifier),
provider As IFormatProvider
) As DoubleMatrix
public:
static DoubleMatrix^ Encode(
TextReader^ reader,
wchar_t columnDelimiter,
IndexCollection^ extractedColumns,
bool firstLineContainsVariableNames,
Dictionary<int, Codifier^>^ specialCodifiers,
IFormatProvider^ provider
)
static member Encode :
reader : TextReader *
columnDelimiter : char *
extractedColumns : IndexCollection *
firstLineContainsVariableNames : bool *
specialCodifiers : Dictionary<int, Codifier> *
provider : IFormatProvider -> DoubleMatrix
Data Extraction
Each line from the stream is interpreted as the information about variables observed at a given instance. A line is split into tokens, each corresponding to a (zero-based) column, which in turn stores the data of a given variable. Columns are assumed to be separated from each other by the character passed as columnDelimiter. Data for a variable are extracted only if the corresponding column index is in the collection extractedColumns.
Special Codification
By default, tokens in a column are interpreted as numerical data, which are inserted in the matrix using ToDouble(String, IFormatProvider). This behavior can be overridden by mapping a special codifier to a given column by inserting, in the dictionary specialCodifiers, the codifier as a value keyed with the index of the column whose data are to be transformed. A special codifier can be useful if a given column corresponds to a categorical variable whose labels must be represented via numerical codes in the returned matrix.
In the following example, a stream is read to encode data into a matrix. The stream contains two columns, the first corresponding to a variable representing time instants, and the second to a numerical one. A special codifier is assigned to the first column to define codes for time representations.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
namespace Novacta.Analytics.CodeExamples
{
    /// <summary>
    /// Shows how to encode a delimited text stream into a
    /// <c>DoubleMatrix</c> while mapping a special codifier to a column
    /// whose tokens are not plain numbers.
    /// </summary>
    public class MatrixEncodeExample0
    {
        // Format shared by the codifier (parsing) and by the decoding step
        // (formatting), so that the two can never drift apart.
        private const string TimeFormat = "yyyyMMdd HH:mm:ss.fff zzz";

        /// <summary>
        /// Writes sample data to an in-memory stream, encodes it into a
        /// matrix, and prints both the matrix and the decoded TIME column
        /// to the console.
        /// </summary>
        public void Main()
        {
            // Create a data stream.
            string[] data = [
                "TIME,NUMBER",
                "20200410 09:42:00.000 +00:00, -2.2",
                "20210511 11:51:00.010 +00:00, 0.0",
                "20220612 15:11:31.200 +00:00, -3.3",
                "20230713 17:32:10.749 +00:00, -1.1",
                "20240814 09:42:00.150 +00:00, 4.4" ];

            using MemoryStream stream = new();

            // The writer leaves the stream open so that it can be read
            // back; disposing the writer flushes any buffered text.
            using (StreamWriter writer = new(stream, leaveOpen: true))
            {
                foreach (string line in data)
                {
                    writer.WriteLine(line);
                }
            }

            // Rewind the stream before reading what was just written.
            stream.Position = 0;

            // Define a special codifier for variable TIME
            // using a local function: each token is converted to the
            // number of milliseconds elapsed since the Unix epoch.
            static double TimeCodifier(string token, IFormatProvider provider)
            {
                double datum = DateTimeOffset.ParseExact(
                    input: token,
                    format: TimeFormat,
                    formatProvider: provider).ToUnixTimeMilliseconds();

                return datum;
            }

            // Attach the special codifier to variable TIME,
            // which is stored in the first (zero-based) column.
            int timeColumnIndex = 0;
            var specialCodifiers = new Dictionary<int, Codifier>
            {
                { timeColumnIndex, TimeCodifier }
            };

            // Encode the matrix.
            using StreamReader streamReader = new(stream);
            char columnDelimiter = ',';
            IndexCollection extractedColumns = IndexCollection.Range(0, 1);
            bool firstLineContainsVariableNames = true;

            DoubleMatrix matrix = DoubleMatrix.Encode(
                streamReader,
                columnDelimiter,
                extractedColumns,
                firstLineContainsVariableNames,
                specialCodifiers,
                CultureInfo.InvariantCulture);

            // Show the matrix.
            Console.WriteLine("Encoded matrix:");
            Console.WriteLine();
            Console.Write(matrix);
            Console.WriteLine();

            // Decode variable TIME.
            Console.WriteLine("Decoded variable TIME:");
            Console.WriteLine();

            var time = matrix[":", 0];

            for (int i = 0; i < time.Count; i++)
            {
                // Format with the invariant culture: the ':' specifier in a
                // custom format string is culture-sensitive, and the output
                // must match the format used when parsing.
                Console.WriteLine(
                    "Time {0}: {1}",
                    i,
                    DateTimeOffset
                        .FromUnixTimeMilliseconds(Convert.ToInt64(time[i]))
                        .ToString(TimeFormat, CultureInfo.InvariantCulture));
            }
        }
    }
}
// Executing method Main() produces the following output:
//
// Encoded matrix:
//
// [TIME] [NUMBER]
// 1.58651172e+12 -2.2
// 1.62073386e+12 0
// 1.65504669e+12 -3.3
// 1.68926953e+12 -1.1
// 1.72362852e+12 4.4
//
//
// Decoded variable TIME:
//
// Time 0: 20200410 09:42:00.000 +00:00
// Time 1: 20210511 11:51:00.010 +00:00
// Time 2: 20220612 15:11:31.200 +00:00
// Time 3: 20230713 17:32:10.749 +00:00
// Time 4: 20240814 09:42:00.150 +00:00
ArgumentNullException | reader is null. -or- extractedColumns is null. -or- specialCodifiers is null. -or- provider is null. |
ArgumentException | specialCodifiers contains null values or keys which are not in the extractedColumns collection. |
InvalidDataException |
There are no data rows in the stream accessed by
reader. -or- At least one row lacks data for some column specified by extractedColumns. This can happen if there are missing columns, or if tokens extracted from the stream are null or consist only of white-space characters. |