I have a requirement where the file should be split using a given character.
Default splitting options are CRLF,LF and CR.
In these cases I am splitting the line by \r\n and \n and \r respectively.
Also I have requirement where any size of file should be processed.
(Processing is basically inserting the given string in a file at given position).
For this I am reading the file in chunk of 1024 bytes.
Then I am applying the string.Split() method.
Split() method gives options for ignoring white spaces and none.
I have to add back these line break characters to the line.
for this I am using a binary writer and I am writing the byte array to the new file.
Issue:-
1) When line break is CRLF, and the split option is NONE, while spaces are also added in the splitted array. Second option is given (to ignore white spaces) CRLF works properly.
2)Bit ignoring white space option creates other problems, as I am reading the file byte by byte I can’t ignore a white space.
3)When line break characters are other than default(e.g. ‘|’, a null value is prepended to the resulting line.
Can anybody give solution to my issues?
Here is the method I have written
private static void ProcessOriginalFile(string filePath, string destinationFilePath, int fromPosition, int toPosition, string lineBreakCharacter, string replaceString, long fileSize, bool ignoreHeader, bool ignoreFooter)
{
int chunkSize = fileSize < 1024 ? (int)fileSize : 1024;//bytes
byte[] chunkData = new byte[chunkSize];
char[] charactersSeparator=new char[1];
charactersSeparator = CommonOperations.ResolveTheRecordBreak(lineBreakCharacter);
int totalLineBreakCharactersLength = 0;
for (int i = 0; i < charactersSeparator.Length; i++)
{
if (charactersSeparator[i] == 0)
break;
totalLineBreakCharactersLength = totalLineBreakCharactersLength + BitConverter.GetBytes(charactersSeparator[i]).Length;
}
using (BinaryReader fileReader = new BinaryReader(new FileStream(filePath, FileMode.Open)))
{
string lastChunk = string.Empty;
string chunkcontents = string.Empty;
IList<byte[]> dataToBeWritten = new List<byte[]>();
while (fileSize > 0)
{
chunkData = fileReader.ReadBytes(chunkSize);
byte[] chunkToBeWritten = new byte[chunkSize];
chunkcontents = chunkcontents + System.Text.ASCIIEncoding.UTF8.GetString(chunkData);
string[] splittedArray = new string[1];
splittedArray = chunkcontents.Split(charactersSeparator, StringSplitOptions.RemoveEmptyEntries);
if (ignoreHeader)
{
splittedArray = splittedArray.Skip(1).ToArray();
ignoreHeader = false;
if (splittedArray.Count() == 0)
continue;
}
int count = 0;
if (splittedArray != null || splittedArray.Count() > 0)
{
foreach (string str in splittedArray)
{
++count;
if (count != splittedArray.Length)
{
string stringToBeEdited = string.Empty;
stringToBeEdited = str;
if (stringToBeEdited.Length < (toPosition + 1))
throw new Exception("Position exceeds the string length. Line contents are : " + stringToBeEdited + " String length is : " + stringToBeEdited.Length + " To Position is : " + toPosition);
//replace the text between from and two positions with the replaced string
stringToBeEdited = stringToBeEdited.Remove(fromPosition, toPosition + 1 - fromPosition).Insert(fromPosition, replaceString);
//clear the array
dataToBeWritten.Clear();
AddLineBreakCharacter(lineBreakCharacter, charactersSeparator, totalLineBreakCharactersLength, dataToBeWritten, ref chunkToBeWritten, ref stringToBeEdited);
//write data using binary writer
WriteBinaryDataToFile(destinationFilePath, dataToBeWritten);
totalLinesProcessed++;
}
else
{
lastChunk = str;
chunkcontents = str;
}
}
}
fileSize = fileSize - chunkSize;
/*if file size is less than the chunksize*2, then chunk size should be the file size
* and chunkdata array length should be that of the file size*/
if (fileSize < (chunkSize * 2))
{
chunkSize = (int)fileSize;
chunkData = new byte[chunkSize];
}
}
if (!string.IsNullOrEmpty(lastChunk))
{
if (ignoreFooter == false)
{
if (lastChunk.Length >= toPosition + 1)
lastChunk = lastChunk.Remove(fromPosition, toPosition + 1 - fromPosition).Insert(fromPosition, replaceString);
else
throw new Exception("Position exceeds in the last line of the file. Line contents are : " + lastChunk + " String length is : " + lastChunk.Length + " To Position is : " + toPosition);
dataToBeWritten.Clear();
byte[] chunkToBeWritten = new byte[chunkSize];
AddLineBreakCharacter(lineBreakCharacter, charactersSeparator, totalLineBreakCharactersLength, dataToBeWritten, ref chunkToBeWritten, ref lastChunk);
WriteBinaryDataToFile(destinationFilePath, dataToBeWritten);
totalLinesProcessed++;
}
}
}
}
private static void WriteBinaryDataToFile(string destinationFilePath, IList<byte[]> chunkToBeWritten)
{
using (FileStream fileSream = new FileStream(destinationFilePath, FileMode.Append))
{
using (BinaryWriter outfile = new BinaryWriter(fileSream, Encoding.ASCII))
{
foreach (byte[] item in chunkToBeWritten)
{
outfile.Write(item);
}
}
}
}
private static void AddLineBreakCharacter(string lineBreakCharacter, char[] charactersSeparator, int totalLineBreakCharactersLength, IList<byte[]> dataToBeWritten, ref byte[] chunkToBeWritten, ref string stringToBeEdited)
{
switch (lineBreakCharacter)
{
case CommonConstants.NEW_LINE:
stringToBeEdited = stringToBeEdited + CommonConstants.CARRIAGE_RETURN_CHARACTER + CommonConstants.NEW_LINE_CHARACTER;
chunkToBeWritten = System.Text.Encoding.UTF8.GetBytes(stringToBeEdited);
//add modified line
dataToBeWritten.Add(chunkToBeWritten);
break;
case CommonConstants.LINE_FEED:
stringToBeEdited = stringToBeEdited + CommonConstants.NEW_LINE_CHARACTER;
chunkToBeWritten = System.Text.Encoding.UTF8.GetBytes(stringToBeEdited);
//add modified line
dataToBeWritten.Add(chunkToBeWritten);
break;
case CommonConstants.CARRIAGE_RETURN:
stringToBeEdited = stringToBeEdited + CommonConstants.CARRIAGE_RETURN_CHARACTER;
chunkToBeWritten = System.Text.Encoding.UTF8.GetBytes(stringToBeEdited);
//add modified line
dataToBeWritten.Add(chunkToBeWritten);
break;
default:
chunkToBeWritten = System.Text.Encoding.UTF8.GetBytes(stringToBeEdited);
//add modified line
dataToBeWritten.Add(chunkToBeWritten);
byte[] seperatorCharArray = new byte[totalLineBreakCharactersLength];
int destinationIndex = 0;
foreach (char ch in charactersSeparator)
{
if (ch == 0)
break;
//convert the character to byte array
byte[] charArr = BitConverter.GetBytes(ch);
//copy character bytes to seperator character array
Array.Copy(charArr, 0, seperatorCharArray, destinationIndex, charArr.Length);
}
//add line break characters array to list
dataToBeWritten.Add(seperatorCharArray);
break;
}
}
I have solved the issue. I have simply dumped above code and used TextReader to read the file character by character. Simultaneously I have checked for line break character. As soon as I got it, I have processed the line.