校验utf8数组是否是一个完整的字符编码

6/29/2024 7:43:44 PM
269
0

正在使用串口做上位机通信工程。为了方便调试,下位机通过串口同时发送非自定义协议消息(普通文本)和自定义协议消息两种消息,所以消息解析起来比较麻烦。

自定义协议消息格式:0x55 0xAA [len] [command]  [body]  CRC16(两个字节) 0xFF
大致的原理:

将串口所有消息放到一个 byte[] buffer 中,使用 bufferIndex 作为游标指针,它总是指向 buffer 中最后一个有效数据字节 +1 的位置,即 buffer 中第一个空闲的字节位。串口新接收的数据将从 bufferIndex 开始放置。
使用另外一个游标指针 processedIndex 来指向到下一个要被处理的数据位。通过移动processedIndex游标,判断是否遇到命令,如果遇到协议命令,则取出该协议命令的数据。否则将非协议命令的数据放到 infoBuffer中。
由于串口发送数据时可能在 byte[] 的任意位置断开并分多次发送,所以一次收到的数据可能不完整,部分字符会被拆成两次发送到上位机。因此上位机接到数据后,要判断最后几个字节是否构成一个完整的字符。

    /// <summary>
    /// Callback invoked with a decoded run of plain (non-protocol) serial text.
    /// </summary>
    /// <param name="message">The text decoded as UTF-8.</param>
    public delegate void SerialMsgPrintDelegate(string message);

    /// <summary>
    /// Callback invoked for every protocol frame that passed the end-marker and CRC checks.
    /// </summary>
    /// <param name="command">The command byte of the frame.</param>
    /// <param name="message">A copy of the frame body (may be empty).</param>
    public delegate void CommandDataMsgDelegate(byte command, byte[] message);

    /// <summary>
    /// Splits a serial byte stream into protocol frames and plain UTF-8 text.
    /// Frame layout: 0x55 0xAA [len] [command] [body: len bytes] [CRC16: 2 bytes] 0xFF.
    /// Everything that is not part of a frame is treated as UTF-8 text. Because the
    /// serial port may cut the stream at any byte, incomplete frames and text that ends
    /// in the middle of a multi-byte character are kept until more data arrives.
    /// </summary>
    internal class SerialDataHandle
    {
        private const byte FrameHeader1 = 0x55;
        private const byte FrameHeader2 = 0xAA;
        private const byte FrameEndMarker = 0xFF;

        // Header x2 + length + command + CRC16 x2 + end marker.
        private const int FrameOverhead = 7;

        // Offset of the body relative to the first header byte.
        private const int PayloadOffset = 4;

        // Number of trailing text bytes inspected when looking for a split character.
        private const int Utf8TailWindow = 6;

        // Text bytes collected so far that have not been delivered yet.
        private byte[] infoBuffer = new byte[2048];
        private int infoBufferIndex = 0;

        // Raw bytes received from the port that have not been consumed yet.
        private byte[] buffer = new byte[1024];

        /// <summary>
        /// Number of valid bytes in <c>buffer</c>; it is also the index at which the
        /// next received byte will be stored.
        /// </summary>
        private int bufferIndex = 0;

        public SerialMsgPrintDelegate SerialMsgEvent;
        public CommandDataMsgDelegate CommandDataMsgEvent;

        /// <summary>
        /// Appends a newly received chunk to the internal buffer and processes as much
        /// of it as possible: complete frames are dispatched through
        /// <see cref="CommandDataMsgEvent"/>, text is dispatched through
        /// <see cref="SerialMsgEvent"/>. Unfinished data stays buffered for the next call.
        /// </summary>
        /// <param name="data">The bytes just read from the serial port.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public void ProcessData(byte[] data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // Grow instead of throwing when leftover + new chunk exceed the current capacity.
            EnsureCapacity(ref buffer, bufferIndex + data.Length);
            Array.Copy(data, 0, buffer, bufferIndex, data.Length);
            bufferIndex += data.Length;

            int processedIndex = 0; // index of the next byte that still has to be handled

            while (processedIndex < bufferIndex)
            {
                int remaining = bufferIndex - processedIndex;

                if (buffer[processedIndex] == FrameHeader1)
                {
                    if (remaining < 2)
                    {
                        // A lone trailing 0x55 may be the first half of a header that was
                        // split between two chunks; wait for the next byte to decide.
                        break;
                    }

                    if (buffer[processedIndex + 1] == FrameHeader2)
                    {
                        if (remaining < FrameOverhead)
                        {
                            break; // not even the smallest possible frame is available yet
                        }

                        int totalLength = buffer[processedIndex + 2] + FrameOverhead;
                        if (remaining < totalLength)
                        {
                            break; // frame body is still on its way
                        }

                        DispatchFrame(processedIndex, totalLength);

                        // The claimed frame is skipped even when validation failed.
                        processedIndex += totalLength;
                        continue;
                    }
                }

                processedIndex = ConsumeText(processedIndex);
            }

            // Move whatever could not be handled yet to the front of the buffer.
            int leftover = bufferIndex - processedIndex;
            if (leftover > 0 && processedIndex > 0)
            {
                Array.Copy(buffer, processedIndex, buffer, 0, leftover);
            }
            bufferIndex = leftover;
        }

        /// <summary>
        /// Validates the complete frame starting at <paramref name="frameStart"/> and, when the
        /// end marker and the CRC match, forwards command and body to the command handler.
        /// </summary>
        private void DispatchFrame(int frameStart, int totalLength)
        {
            if (buffer[frameStart + totalLength - 1] != FrameEndMarker)
            {
                return;
            }

            int length = totalLength - FrameOverhead;

            // BitConverter reads the two CRC bytes in the byte order of the host machine.
            ushort receivedCrc16 = BitConverter.ToUInt16(buffer, frameStart + PayloadOffset + length);

            // The CRC covers header, length, command and body.
            ushort computedCrc16 = CRC16_Check(buffer, frameStart, length + PayloadOffset);
            if (receivedCrc16 != computedCrc16)
            {
                return;
            }

            byte command = buffer[frameStart + 3];
            byte[] payload = new byte[length];
            Array.Copy(buffer, frameStart + PayloadOffset, payload, 0, length);
            HandleCommandData(command, payload);
        }

        /// <summary>
        /// Collects text bytes starting at <paramref name="start"/> until a frame header, a
        /// trailing 0x55 that might begin a header, or the end of the received data.
        /// The collected text is delivered when a header follows or when it ends on a
        /// complete UTF-8 character; otherwise it is kept for the next chunk.
        /// </summary>
        /// <returns>The index of the first byte that was not consumed as text.</returns>
        private int ConsumeText(int start)
        {
            int end = start;
            bool headerFollows = false;

            while (end < bufferIndex)
            {
                if (buffer[end] == FrameHeader1)
                {
                    if (end + 1 == bufferIndex)
                    {
                        // Never look past the valid data; leave the byte for the next chunk.
                        break;
                    }

                    if (buffer[end + 1] == FrameHeader2)
                    {
                        headerFollows = true;
                        break;
                    }
                }

                end++;
            }

            int count = end - start;
            EnsureCapacity(ref infoBuffer, infoBufferIndex + count);
            Array.Copy(buffer, start, infoBuffer, infoBufferIndex, count);
            infoBufferIndex += count;

            if (headerFollows)
            {
                // A frame interrupts the text, so the text is delivered as it is.
                FlushText("<debug2>");
            }
            else if (TextEndsOnCharacterBoundary())
            {
                // End of the received data is not necessarily the end of the message,
                // but at least no character is cut in half.
                FlushText("<debug1>");
            }

            return end;
        }

        /// <summary>
        /// Checks the last few collected text bytes for a truncated multi-byte character.
        /// </summary>
        private bool TextEndsOnCharacterBoundary()
        {
            int tailLength = Math.Min(infoBufferIndex, Utf8TailWindow);
            byte[] tail = new byte[tailLength];
            Array.Copy(infoBuffer, infoBufferIndex - tailLength, tail, 0, tailLength);
            return IsCompleteUTF8Character(tail);
        }

        /// <summary>
        /// Decodes the collected text bytes as UTF-8, resets the text buffer and delivers the text.
        /// </summary>
        private void FlushText(string debugTag)
        {
            Console.WriteLine(debugTag + BitConverter.ToString(infoBuffer, 0, infoBufferIndex));
            string message = Encoding.UTF8.GetString(infoBuffer, 0, infoBufferIndex);
            infoBufferIndex = 0;
            HandleSerialPrint(message);
        }

        /// <summary>
        /// Makes sure <paramref name="array"/> can hold at least <paramref name="required"/> bytes,
        /// keeping its current content.
        /// </summary>
        private static void EnsureCapacity(ref byte[] array, int required)
        {
            if (required <= array.Length)
            {
                return;
            }

            Array.Resize(ref array, Math.Max(required, array.Length * 2));
        }

        /// <summary>
        /// Delivers plain serial text to the subscriber, if there is one.
        /// </summary>
        private void HandleSerialPrint(string message)
        {
            SerialMsgEvent?.Invoke(message);
        }

        /// <summary>
        /// Logs a validated command frame and delivers it to the subscriber, if there is one.
        /// </summary>
        private void HandleCommandData(byte command, byte[] data)
        {
            Console.WriteLine($"Command: {command}, Data: {BitConverter.ToString(data)}");
            CommandDataMsgEvent?.Invoke(command, data);
        }

        /// <summary>
        /// Determines whether the given bytes end on a complete UTF-8 character, i.e. whether
        /// the last character has not been cut off. Only the trailing sequence is inspected:
        /// the array is scanned backwards over continuation bytes (10xxxxxx) up to the first
        /// other byte, and the number of leading 1 bits of that byte must equal the
        /// length of the trailing sequence.
        /// </summary>
        /// <param name="byteArray">The last bytes of the text (up to 6; fewer if less is available).</param>
        /// <returns>
        /// true when the array ends with a complete character; false when the last character
        /// is truncated, when the trailing sequence is malformed, or when the array is empty.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="byteArray"/> is null.</exception>
        public static bool IsCompleteUTF8Character(byte[] byteArray)
        {
            if (byteArray == null)
            {
                throw new ArgumentNullException(nameof(byteArray));
            }

            int continuationCount = 0;

            for (int i = byteArray.Length - 1; i >= 0; i--)
            {
                byte current = byteArray[i];

                if ((current & 0xC0) == 0x80)
                {
                    continuationCount++;
                    continue;
                }

                // First non-continuation byte from the end: it alone decides the result.
                // Scanning further back would mix in bytes of earlier characters.
                int declaredLength = 0;
                for (int mask = 0x80; mask != 0 && (current & mask) != 0; mask >>= 1)
                {
                    declaredLength++;
                }

                if (declaredLength == 0)
                {
                    // ASCII byte: complete unless stray continuation bytes follow it.
                    return continuationCount == 0;
                }

                return declaredLength == continuationCount + 1;
            }

            // Empty input or continuation bytes only: no lead byte in sight.
            return false;
        }

        /// <summary>
        /// Computes a CRC-16 over <paramref name="length"/> bytes of <paramref name="data"/>
        /// beginning at <paramref name="start"/>: initial value 0xFFFF, processed bit by bit
        /// from the least significant bit with polynomial 0xA001, no final XOR.
        /// </summary>
        private ushort CRC16_Check(byte[] data, int start, int length)
        {
            ushort crc = 0xFFFF;
            for (int i = start; i < start + length; i++)
            {
                crc ^= data[i];
                for (int j = 0; j < 8; j++)
                {
                    if ((crc & 0x0001) != 0)
                    {
                        crc >>= 1;
                        crc ^= 0xA001;
                    }
                    else
                    {
                        crc >>= 1;
                    }
                }
            }
            return crc;
        }

    }

 

这个方法用于判断utf8  最后字节是否是完整的字符的字节。

根据 UTF-8 的编码规则,一个字符可以编码成 1-6 个字节(现行标准 RFC 3629 只使用 1-4 字节,5、6 字节形式只存在于早期定义中),格式如下:

1字节 0xxxxxxx 
2字节 110xxxxx 10xxxxxx 
3字节 1110xxxx 10xxxxxx 10xxxxxx 
4字节 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
5字节 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
6字节 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 

如果一个字节的第一位是0,则这个字节单独就是一个字符;如果第一位是1,则连续有多少个1,就表示当前字符占用多少个字节。

所以获取数组最后6个字节,从后往前按这个规则判断即可。

/// <summary>
/// Determines whether the given bytes end on a complete UTF-8 character, i.e. whether
/// the last character has not been cut off. Only the trailing sequence is inspected:
/// the array is scanned backwards over continuation bytes (10xxxxxx) up to the first
/// other byte, and the number of leading 1 bits of that byte must equal the
/// length of the trailing sequence.
/// </summary>
/// <param name="byteArray">The last bytes of the text (up to 6; fewer if less is available).</param>
/// <returns>
/// true when the array ends with a complete character; false when the last character
/// is truncated, when the trailing sequence is malformed, or when the array is empty.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="byteArray"/> is null.</exception>
public static bool IsCompleteUTF8Character(byte[] byteArray)
{
    if (byteArray == null)
    {
        throw new ArgumentNullException(nameof(byteArray));
    }

    int continuationCount = 0;

    for (int i = byteArray.Length - 1; i >= 0; i--)
    {
        byte current = byteArray[i];

        if ((current & 0xC0) == 0x80)
        {
            continuationCount++;
            continue;
        }

        // First non-continuation byte from the end: it alone decides the result.
        // Scanning further back would mix in bytes of earlier characters.
        int declaredLength = 0;
        for (int mask = 0x80; mask != 0 && (current & mask) != 0; mask >>= 1)
        {
            declaredLength++;
        }

        if (declaredLength == 0)
        {
            // ASCII byte: complete unless stray continuation bytes follow it.
            return continuationCount == 0;
        }

        return declaredLength == continuationCount + 1;
    }

    // Empty input or continuation bytes only: no lead byte in sight.
    return false;
}

全部评论



提问