问题描述
现在我想将文件编码格式更改为UTF格式-8使用Visual Basic 6.0。我不想修改原始文件;我只想改变编码格式。
我在网上搜索过;但是不能理解VB代码,不知道该怎么做。
我也希望能够从UTF- 8个编码文件。
注意。这个答案已经被广泛地扩展以适应编辑过的问题,反过来是由于
以下代码将将ANSI,UTF-16和UTF-32编码的字符串从文件转换为UTF-8字符串,在VB6中。您必须在整个文件中加载并输出。请注意,如果它是真正的通用的,LineInputUTF8()方法将是LineInput(),并且需要一个代码页。
选项显式
私有声明函数MultiByteToWideChar LibKernel32.dll(_
ByVal CodePage As Long,_
ByVal dwFlags As Long,_
ByVal lpMultiByteStr As Long, _
ByVal cbMultiByte As Long,_
ByVal lpWideCharStr As Long,_
ByVal cchWideChar As Long _
)As Long
私有声明函数WideCharToMultiByte Lib Kernel32.dll(_
ByVal CodePage As Long,_
ByVal dwFlags As Long,_
ByVal lpWideCharStr As Long,_
ByVal cchWideChar As Long,_
ByVal lpMultiByteStr As Long,_
ByVal cbMultiByte As Long,_
ByVal lpDefaultChar As Long,_
ByVal lpUsedDefaultChar As Long _
)As Long
Public Const CP_ACP As Long = 0'默认的ANSI代码页。
Public Const CP_UTF8 As Long = 65001'UTF8。
Public Const CP_UTF16_LE As Long = 1200'UTF16 - little endian。
Public Const CP_UTF16_BE As Long = 1201'UTF16 - big endian。
Public Const CP_UTF32_LE As Long = 12000'UTF32 - little endian。
Public Const CP_UTF32_BE As Long = 12001'UTF32 - big endian。
'目的:启发式确定文件中的字节是否为UTF-8。
私有函数FileBytesAreUTF8(ByVal the_iFileNo As Integer)As Boolean
Const knSampleByteSize As Long = 2048
Dim nLof As Long
Dim nByteCount As Long
Dim nByteIndex As Long
Dim nCharExtraByteCount As Long
Dim bytValue As Byte
'我们查看第一个< knSampleByteSize>文件的字节数。但是,如果文件较小,我们将必须
'使用较小的大小。
nLof = LOF(the_iFileNo)
如果nLof< knSampleByteSize然后
nByteCount = nLof
Else
nByteCount = knSampleByteSize
End If
'转到文件的开头。
寻求#the_iFileNo,1
对于nByteIndex = 1到nByteCount
获取#the_iFileNo,bytValue
'如果我们正在处理的字节超过1,那么我们就到了下一个字符。
如果nCharExtraByteCount = 0然后
'
'UTF-8规范说,字符的第一个字节具有掩码位,表示后面有多少个字节。
'
'参见:http://en.wikipedia.org/wiki/UTF-8#Description
'
'
'中的字节序列字节1字节2字节3字节4
'1 0xxxxxxx
'2 110xxxxx 10xxxxxx
'3 1110xxxx 10xxxxxx 10xxxxxx
'4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
'
如果bytValue And& H80)=& H0然后
nCharExtraByteCount = 0
ElseIf(bytValue And& HE0)=& HC0然后
nCharExtraByteCount = 1
ElseIf(bytValue And & HF0)=&然后
nCharExtraByteCount = 2
ElseIf(bytValue And& HF8)=& HF0然后
nCharExtraByteCount = 3
Else
'如果没有这些掩码匹配,那么这不能是UTF-8的特征之三。
FileBytesAreUTF8 = False
退出函数
结束如果
Else
'所有后续字节必须像上表一样被屏蔽。
如果(bytValue And& HC0)=& H80然后
nCharExtraByteCount = nCharExtraByteCount - 1
如果nCharExtraByteCount = 0然后
FileBytesAreUTF8 = True
End If
Else
'不是UTF8字符。
FileBytesAreUTF8 = False
退出函数
结束如果
结束如果
下一个nByteIndex
结束函数
'目的:使用字节数组< the_abytCPString>中的字符串,代码页面< the_nCodePage>转换为VB字符串。
私有函数FromCPString(ByRef the_abytCPString()As Byte,ByVal the_nCodePage As Long)As String
Dim sOutput As String
Dim nValueLen As Long
Dim nOutputCharLen As Long
'如果代码页表示这已经与VB字符串兼容,那么只需将其复制到字符串中即可。不乱
如果the_nCodePage = CP_UTF16_LE然后
FromCPString = the_abytCPString()
Else
'缓存输入长度。
nValueLen = UBound(the_abytCPString) - LBound(the_abytCPString)+ 1
'查看输出缓冲区的大小。
nOutputCharLen = MultiByteToWideChar(the_nCodePage,0& VarPtr(the_abytCPString(LBound(the_abytCPString))),nValueLen,0&)
'将输出字节数组的大小调整为UTF-8字符串。
sOutput = Space $(nOutputCharLen)
'再次调用此API,此时给出一个指向输出字节数组的指针。
MultiByteToWideChar the_nCodePage,0& VarPtr(the_abytCPString(LBound(the_abytCPString))),nValueLen,StrPtr(sOutput),nOutputCharLen
'返回数组。
FromCPString = sOutput
如果
结束函数
公共函数GetContents(ByVal the_sTextFile As String,ByRef out_nCodePage As Long,Optional ByVal the_nDesiredCodePage As Long = -1,Optional ByRef out_bContainedBOM As Boolean)As String
Dim iFileNo As Integer
Dim abytFileContents()As Byte
Dim nDataSize As Long
iFileNo = FreeFile
OpenForInput the_sTextFile,iFileNo,out_nCodePage,the_nDesiredCodePage,out_bContainedBOM
'我们要读取文件的全部内容(不包括任何BOM值)。
'调用OpenForInput()后,文件指针应位于任何BOM之后。
'所以大小文件内容缓冲区到<文件大小> - <当前位置> + 1.
nDataSize = LOF(iFileNo) - Seek(iFileNo)+ 1
ReDim abytFileContents(1到nDataSize)
获取#iFileNo,abytFileContents()
关闭iFileNo
'现在我们必须将其转换为UTF-8。但是我们必须先转换为Windows NT标准的UTF-16 LE。
GetContents = FromCPString(abytFileContents(),out_nCodePage)
结束函数
'目的:读取文件当前行的末尾,重新定位到下一行的开头,如果有的话,
'输出所有找到的字符。
'输入:the_nFileNo文件的编号。
'输出:out_sLine文件中当前位置的行。
'返回:如果有更多数据,则为真。
公共函数LineInputUTF8(ByVal the_nFileNo As Integer,ByRef out_sLine As String)As Boolean
Dim bytValue As Byte
Dim abytLine()As Byte
Dim nStartOfLinePos As Long
Dim nEndOfLinePos As Long
Dim nStartOfNextLine As Long
Dim nLineLen As Long
'将当前文件位置保存为行的开头,并缓存此值。
nStartOfLinePos = Seek(the_nFileNo)
'从当前位置检索第一个字节。
获取#the_nFileNo,,bytValue
'循环直到遇到文件结尾。
直到EOF(the_nFileNo)
'检查此字节是否表示回车符或换行字符(表示新行)。
如果bytValue = 13或bytValue = 10然后
'至此,当前位置是*后面的CR或LF字符,所以要获得
'最后一个字节的位置行,我们必须回去两个字节。
nEndOfLinePos = Seek(the_nFileNo) - 2
'如果这是回车符,那么我们必须检查下一个字符。
如果bytValue = 13然后
获取#the_nFileNo,bytValue
'这是一个换行符?
如果bytValue = 10然后
'是的。假设CR-LF计数为单个NewLine。所以下一行的开始应该跳过换行。
nStartOfNextLine = nEndOfLinePos + 3
Else
'否下一行的开头是当前位置。
nStartOfNextLine = nEndOfLinePos + 2
End If
ElseIf bytValue = 10然后
'如果这是换行,则下一行的开始是当前位置。
nStartOfNextLine = nEndOfLinePos + 2
End If
'由于我们已经处理了行中的所有字节,请退出循环。
退出Do
结束如果
'获取下一个字节。
获取#the_nFileNo,bytValue
循环
'检查是否有一行结束。
如果nEndOfLinePos = 0然后
'否,这是文件的结尾 - 所以使用所有剩余的字符。
nLineLen = Seek(the_nFileNo) - nStartOfLinePos - 1
Else
'是 - 所以使用所有字符直到行尾位置。
nLineLen = nEndOfLinePos - nStartOfLinePos + 1
End If
'这行是空吗?
如果nLineLen = 0然后
'是 - 只返回一个空字符串。
out_sLine = vbNullString
Else
'否 - 将所有从字符串开始到结尾的字节都拉到字节数组中,然后将其从UTF-8转换为VB字符串。
ReDim abytLine(1到nLineLen)
获取#the_nFileNo,nStartOfLinePos,abytLine()
out_sLine = FromCPString(abytLine(),CP_UTF8)
End If
'如果之后有行,则移动到行的开头,并返回True。
如果nStartOfNextLine> 0然后
寻求#the_nFileNo,nStartOfNextLine
LineInputUTF8 = True
如果
结束函数
'目的:模拟打开 fileNameFor Input As #fileNo - 还可以通过代码页值返回此类型的文本。
'输入:the_sFileName
'the_iFileNo
'(the_nDesiredCodePage)要与此文件一起使用的代码页。
'如果此值设置为默认值-1,则表示将从文件中确定代码页。
'输出:out_nCodePage如果< the_nDesiredCodePage>返回,则只返回六个有效值。设为-1。
'CP_ACP ANSI代码页
'CP_UTF8 UTF-8
'CP_UTF16LE UTF-16小尾数(VB和NT默认字符串编码)
'CP_UTF16BE UTF-16 Big Endian
'CP_UTF32LE UTF-32 Little Endian
'CP_UTF32BE UTF-32 Big Endian
'(out_bContainedBOM)如果设置为True,则该文件以BOM(Byte Order Marker)开头。
Public Sub OpenForInput(ByRef the_sFilename As String,ByVal the_iFileNo As Integer,ByRef out_nCodePage As Long,Optional ByVal the_nDesiredCodePage As Long = -1,Optional ByRef out_bContainedBOM As Boolean)
'请注意,如果我们要考虑到每一种情况,我们应该先读取前4个字节,并检查UTF-32低端和高端的BOM,检查
'UTF-8 BOM的前三个字节,最后检查UTF-16低端和高端BOM的前两个字节。
Dim abytBOM(1到4)As Byte
Dim nCodePage As Long
'默认情况下,没有BOM。
out_bContainedBOM = False
打开the_sFilename对于二进制访问读为#the_iFileNo
'我们对-1感兴趣(确定代码页),然后各种UTF编码。
选择案例the_nDesiredCodePage
案例-1,CP_UTF8,CP_UTF16_BE,CP_UTF16_LE,CP_UTF32_BE,CP_UTF32_LE
'默认代码页。
nCodePage = CP_ACP
'拉入前四个字节以确定BOM(字节顺序标记)。
获取#the_iFileNo,abytBOM()
'以下是文本文件的BOM:
'
'FF FE UTF-16,little endian
'FE FF UTF-16,big endian
'EF BB BF UTF-8
'FF FE 00 00 UTF-32,little endian
'00 00 FE FF UTF-32, big-endian
'
'从这些信息中找出代码页。
选择案例abytBOM(1)
案例& HFF
如果abytBOM(2)=& HFE然后
如果abytBOM(3)= 0和abytBOM 4)= 0然后
nCodePage = CP_UTF32_LE
Else
nCodePage = CP_UTF16_LE
End If
End If
Case& HFE
如果abytBOM (2)=& HFF然后
nCodePage = CP_UTF16_BE
End If
Case& HEF
如果abytBOM(2)=& HBB和abytBOM(3)=& HBF然后
nCodePage = CP_UTF8
End If
Case& H0
如果abytBOM(2)=& H0和abytBOM(3)=& HFE和abytBOM(4) =& HFF然后
nCodePage = CP_UTF32_BE
结束如果
结束选择
'我们是否匹配任何BOM?
如果nCodePage = CP_ACP然后
'否 - 我们仍然默认为ANSI代码页。
'特别检查UTF-8。在UTF-8的标准中没有指定BOM,但根据维基百科(它总是正确的:-)),
'只有Microsoft在文件的开头包含这个标记。
如果FileBytesAreUTF8(the_iFileNo)然后
out_nCodePage = CP_UTF8
Else
out_nCodePage = CP_ACP
结束如果
Else
'是 - 我们已经工作从BOM出来的代码页。
'如果没有提供代码页,我们现在返回我们发现的代码页。
如果the_nDesiredCodePage = -1然后
out_nCodePage = nCodePage
End If
'通知调用者找到了一个BOM。
out_bContainedBOM = True
End If
'将文件指针重置为文件数据的开头。
如果out_bContainedBOM然后
'请注意,如果找到的代码页是两个UTF-32值之一,那么我们已经处于正确的位置。
'否则,我们必须移动到BOM结束之后。
选择案例nCodePage
案例CP_UTF16_BE,CP_UTF16_LE
寻求#the_iFileNo,3
案例CP_UTF8
寻求#the_iFileNo,4
结束选择
否则
'没有BOM,所以只需去文件的开头。
Seek #the_iFileNo,1
End If
Case Else
out_nCodePage = the_nDesiredCodePage
结束选择
End Sub
'目的:模拟'打开'fileName附加为#fileNo'
公共子OpenForAppend(ByRef the_sFilename As String,ByVal the_iFileNo As Integer,可选ByVal the_nCodePage As Long = CP_ACP,Optional ByVal the_bPrefixWithBOM As Boolean = True)
'打开文件并移动到文件的末尾。
打开the_sFilename对于二进制访问写为#the_iFileNo
寻求_iFileNo,LOF(the_iFileNo)+ 1
如果the_bPrefixWithBOM Then
WriteBOM the_iFileNo,the_nCodePage
End如果
End Sub
'目的:模拟'打开'fileName输出为#fileNo'
公共子OpenForOutput(ByRef the_sFilename As String,ByVal the_iFileNo As Integer,可选ByVal the_nCodePage As Long = CP_ACP,可选ByVal the_bPrefixWithBOM As Boolean = True)
'确保我们通过删除文件覆盖该文件...
在错误恢复下一步
Kill the_sFilename
在创建之前,错误GoTo 0
'...
打开the_sFilename对于二进制访问写为#the_iFileNo
如果the_bPrefixWithBOM Then
WriteBOM the_iFileNo,the_nCodePage
End If
End Sub
'目的:模拟'打印#fileNo,值'语句。但只允许一个值。
'设置< the_bAppendNewLine> = False与'Print #fileNo,value;'分离。
Public Sub Print_(ByVal the_iFileNo As Integer,ByRef the_sValue As String,可选ByVal the_nCodePage As Long = CP_ACP,可选ByVal the_bAppendNewLine As Boolean = True)
Const kbytNull As Byte = 0
Const kbytCarriageReturn As Byte = 13
Const kbytNewLine As Byte = 10
放置#the_iFileNo,ToCPString(the_sValue,the_nCodePage)
如果the_bAppendNewLine Then
选择案例the_nCodePage
案例CP_UTF16_BE
放置#the_iFileNo,kbytNull
放置#the_iFileNo,kbytCarriageReturn
放置#the_iFileNo,,kbytNull
放置#the_iFileNo, kbytNewLine
案例CP_UTF16_LE
放置#the_iFileNo,,kbytCarriageReturn
放置#the_iFileNo,,kbytNull
放置#the_iFileNo,,kbytNewLine
放置#the_iFileNo,,kbytNull
案例CP_UTF32_BE
放置#the_iFileNo,kbytNull
放置#the_iFileNo,kbytNull
放置#the_iFileNo,,kbytNull
放置#the_iFileNo,,kbytCarriageReturn
放置#the_iFileNo,,kbytNull
放置#the_iFileNo,kbytNull
放置#the_iFileNo,,kbytNull
放置#the_iFileNo,,kbytNewLine
案例CP_UTF32_LE
放置#the_iFileNo,kbytCarriageReturn
放置#the_iFileNo, ,kbytNull
放置#the_iFileNo,,kbytNull
放置#the_iFileNo,kbytNull
放置#the_iFileNo,,kbytNewLine
放置#the_iFileNo,kbytNull
放置#the_iFileNo, ,kbytNull
放置#the_iFileNo,,kbytNull
案例Else
放置#the_iFileNo,,kbytCarriageReturn
放置#the_iFileNo,,kbytNewLine
结束选择
结束If
End Sub
Public Sub PutContents(B yRef the_sFilename As String,ByRef the_sFileContents As String,Optional ByVal the_nCodePage As Long = CP_ACP,optional the_bPrefixWithBOM As Boolean)
Dim iFileNo As Integer
iFileNo = FreeFile
OpenForOutput the_sFilename,iFileNo,the_nCodePage,the_bPrefixWithBOM
Print_ iFileNo,the_sFileContents ,_nCodePage,False
关闭iFileNo
End Sub
'目的:转换VB字符串(UTF-16)到UTF8 - 作为二进制数组。
私有函数ToCPString(ByRef the_sValue As String,ByVal the_nCodePage As Long)As Byte()
Dim abytOutput()As Byte
Dim nValueLen As Long
Dim nOutputByteLen As Long
如果the_nCodePage = CP_UTF16_LE然后
ToCPString = the_sValue
Else
'缓存输入长度。
nValueLen = Len(the_sValue)
'查看输出缓冲区的大小。
nOutputByteLen = WideCharToMultiByte(the_nCodePage,0& StrPtr(the_sValue),nValueLen,0& 0& 0& 0&)
如果nOutputByteLen& 0然后
'将输出字节数组的大小调整为UTF-8字符串的大小。
ReDim abytOutput(1 To nOutputByteLen)
'再次调用此API,此时给出一个指向输出字节数组的指针。
WideCharToMultiByte the_nCodePage,0& StrPtr(the_sValue),nValueLen,VarPtr(abytOutput(1)),nOutputByteLen,0& 0&
End If
'返回数组。
ToCPString = abytOutput()
结束如果
端功能
私人小组WriteBOM(BYVAL the_iFileNo作为整数,BYVAL the_nCodePage只要)
'FF FE UTF-16,little endian
'FE FF UTF-16,big endian
'EF BB BF UTF-8
'FF FE 00 00 UTF-32,小端点
'00 00 FE FF UTF-32,big-endian
选择案例the_nCodePage
案例CP_UTF8
放置#the_iFileNo,CByte (安培; HEF)
将#the_iFileNo,CByte(安培; HBB)
将#the_iFileNo,CByte(安培; HBF)
案例CP_UTF16_LE
将#the_iFileNo, CByte(安培; HFF)
将#the_iFileNo,CByte(安培; HFE)
案例CP_UTF16_BE
将#the_iFileNo,CByte(安培; HFE)
将#the_iFileNo, ,CByte(& HFF)
案例CP_UTF32_LE
放置#the_iFileNo,CByte(& HFF)
放置#the_iFileNo,CByte(& HFE)
把#the_iFileNo,CByte(安培; H0)
将#the_iFileNo,CByte(安培; H0)
案例CP_UTF32_BE
将#the_iFileNo,CByte(安培; H0)
将#the_iFileNo,CByte(安培; H0)
将#the_iFileNo,CByte(安培; HFE)
将#the_iFileNo,CByte(安培; HFF)
结束选择
End Sub
将以下代码添加到具有VSFlexGrid控件的窗体中与Lucida控制台字体 - 纯粹提供一种显示尽可能多的字符的方式:
Option Explicit
Private Sub Command_Click()
Example_ConvertFileToUTF8
End Sub
Private Sub Command2_Click()
Example_IterateUTF8Lines
End Sub
Private Sub Command3_Click()
Example_ReadWriteUTF8Lines
End Sub
Private Sub Form_Load()
VSFlexGrid.ColWidth(0)= 7000!
End Sub
'目的:将*纯文本文件(UTF16,ASCII,ANSI)转换为UTF8。
Private Sub Example_ConvertFileToUTF8()
Dim nCodePage As Long
Dim bContainedBOM As Boolean
Dim sFileContents As String
'读入内容。
sFileContents = TextFile.GetContents(C:\MysteryEncoding.txt,nCodePage,,bContainedBOM)
'然后转换为UTF8。
TextFile.PutContentsC:\output.txt,sFileContents,CP_UTF8,bContainedBOM
End Sub
'目的:通过UTF的每一行迭代-8文本文件,并将其添加到可显示包含非ANSI字符的VB字符串的控件。
'在这种情况下,我正在使用Font =Lucida Console向FlexGrid添加项目。
Private Sub Example_IterateUTF8Lines()
Dim iFileNo As Integer
Dim lCodePage As Long
Dim sLine As String
iFileNo = FreeFile
TextFile.OpenForInputC:\UTF8.txt,iFileNo,lCodePage
如果lCodePage = CP_UTF8然后
尽管TextFile.LineInputUTF8(iFileNo,sLine)
VSFlexGrid.AddItem sLine
循环
VSFlexGrid.AddItem sLine
Else
MsgBox这不是一个UTF8文件。
结束如果
关闭#iFileNo
End Sub
私有子例子_ReadWriteUTF8Lines()
Dim iFileNoInput作为整数
尺寸iFileNoOutput作为整数
尺寸lCodePage只要
尺寸生产数据作为布尔值
尺寸SLINE作为字符串
iFileNoInput = FreeFile
TEXTFILE .OpenForInputC:\UTF8.txt,iFileNoInput,lCodePage,bBOM
如果lCodePage = CP_UTF8然后
iFileNoOutput = FreeFile
TextFile.OpenForOutput C:\output.txt,iFileNoOutput,lCodePage,bBOM
尽管TextFile.LineInputUTF8(iFileNoInput,sLine)
TextFile.Print_ iFileNoOutput,sLine,lCodePage
循环
TextFile.Print_ iFileNoOutput,sLine,lCodePage,False
关闭#iFileNoOutput
Else
MsgBox这不是一个UTF8文件。
如果
关闭#iFileNoInput
End Sub
I have huge external files with the "ANSI" and "UCS-2 Little Endian" encoding formats.
Now I want to change the file encoding format into UTF-8 using Visual Basic 6.0. I don't want to modify the original file; I just want to change the encoding format alone.
I have searched on the web; but can't understand the VB code, and have no idea how to do it.
I would also like to be able to read lines one at a time from UTF-8 encoded files.
NOTE. This answer has been extensively expanded to fit in with the edited question, which in turn was due to Visual Basic 6 and UTF-8
The following code wraps up converting ANSI, UTF-16 and UTF-32 encoded strings from a file to UTF-8 strings, in VB6. You have to load in the entire file and output it. Note that if it was truly generic, the LineInputUTF8() method would be LineInput(), and require a code page.
Option Explicit
Private Declare Function MultiByteToWideChar Lib "Kernel32.dll" ( _
ByVal CodePage As Long, _
ByVal dwFlags As Long, _
ByVal lpMultiByteStr As Long, _
ByVal cbMultiByte As Long, _
ByVal lpWideCharStr As Long, _
ByVal cchWideChar As Long _
) As Long
Private Declare Function WideCharToMultiByte Lib "Kernel32.dll" ( _
ByVal CodePage As Long, _
ByVal dwFlags As Long, _
ByVal lpWideCharStr As Long, _
ByVal cchWideChar As Long, _
ByVal lpMultiByteStr As Long, _
ByVal cbMultiByte As Long, _
ByVal lpDefaultChar As Long, _
ByVal lpUsedDefaultChar As Long _
) As Long
Public Const CP_ACP As Long = 0 ' Default ANSI code page.
Public Const CP_UTF8 As Long = 65001 ' UTF8.
Public Const CP_UTF16_LE As Long = 1200 ' UTF16 - little endian.
Public Const CP_UTF16_BE As Long = 1201 ' UTF16 - big endian.
Public Const CP_UTF32_LE As Long = 12000 ' UTF32 - little endian.
Public Const CP_UTF32_BE As Long = 12001 ' UTF32 - big endian.
' Purpose: Heuristic to determine whether bytes in a file are UTF-8.
Private Function FileBytesAreUTF8(ByVal the_iFileNo As Integer) As Boolean
Const knSampleByteSize As Long = 2048
Dim nLof As Long
Dim nByteCount As Long
Dim nByteIndex As Long
Dim nCharExtraByteCount As Long
Dim bytValue As Byte
' We look at the first <knSampleByteSize> bytes of the file. However, if the file is smaller, we will have to
' use the smaller size.
nLof = LOF(the_iFileNo)
If nLof < knSampleByteSize Then
nByteCount = nLof
Else
nByteCount = knSampleByteSize
End If
' Go to the start of the file.
Seek #the_iFileNo, 1
For nByteIndex = 1 To nByteCount
Get #the_iFileNo, , bytValue
' If the character we are processing has bytes beyond 1, then we are onto the next character.
If nCharExtraByteCount = 0 Then
'
' The UTF-8 specification says that the first byte of a character has masking bits which indicate how many bytes follow.
'
' See: http://en.wikipedia.org/wiki/UTF-8#Description
'
' Bytes in
' sequence Byte 1 Byte 2 Byte 3 Byte 4
' 1 0xxxxxxx
' 2 110xxxxx 10xxxxxx
' 3 1110xxxx 10xxxxxx 10xxxxxx
' 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
'
If (bytValue And &H80) = &H0 Then
nCharExtraByteCount = 0
ElseIf (bytValue And &HE0) = &HC0 Then
nCharExtraByteCount = 1
ElseIf (bytValue And &HF0) = &HE0 Then
nCharExtraByteCount = 2
ElseIf (bytValue And &HF8) = &HF0 Then
nCharExtraByteCount = 3
Else
' If none of these masks were matched, then this can't be a UTF-8 character.
FileBytesAreUTF8 = False
Exit Function
End If
Else
' All following bytes must be masked as in the table above.
If (bytValue And &HC0) = &H80 Then
nCharExtraByteCount = nCharExtraByteCount - 1
If nCharExtraByteCount = 0 Then
FileBytesAreUTF8 = True
End If
Else
' Not a UTF8 character.
FileBytesAreUTF8 = False
Exit Function
End If
End If
Next nByteIndex
End Function
' Purpose: Take a string whose bytes are in the byte array <the_abytCPString>, with code page <the_nCodePage>, convert to a VB string.
Private Function FromCPString(ByRef the_abytCPString() As Byte, ByVal the_nCodePage As Long) As String
Dim sOutput As String
Dim nValueLen As Long
Dim nOutputCharLen As Long
' If the code page says this is already compatible with the VB string, then just copy it into the string. No messing.
If the_nCodePage = CP_UTF16_LE Then
FromCPString = the_abytCPString()
Else
' Cache the input length.
nValueLen = UBound(the_abytCPString) - LBound(the_abytCPString) + 1
' See how big the output buffer will be.
nOutputCharLen = MultiByteToWideChar(the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, 0&, 0&)
' Resize output byte array to the size of the UTF-8 string.
sOutput = Space$(nOutputCharLen)
' Make this API call again, this time giving a pointer to the output byte array.
MultiByteToWideChar the_nCodePage, 0&, VarPtr(the_abytCPString(LBound(the_abytCPString))), nValueLen, StrPtr(sOutput), nOutputCharLen
' Return the array.
FromCPString = sOutput
End If
End Function
Public Function GetContents(ByVal the_sTextFile As String, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean) As String
Dim iFileNo As Integer
Dim abytFileContents() As Byte
Dim nDataSize As Long
iFileNo = FreeFile
OpenForInput the_sTextFile, iFileNo, out_nCodePage, the_nDesiredCodePage, out_bContainedBOM
' We want to read the entire contents of the file (not including any BOM value).
' After calling OpenForInput(), the file pointer should be positioned after any BOM.
' So size file contents buffer to <file size> - <current position> + 1.
nDataSize = LOF(iFileNo) - Seek(iFileNo) + 1
ReDim abytFileContents(1 To nDataSize)
Get #iFileNo, , abytFileContents()
Close iFileNo
' Now we must convert this to UTF-8. But we have to first convert to the Windows NT standard UTF-16 LE.
GetContents = FromCPString(abytFileContents(), out_nCodePage)
End Function
' Purpose: Reads up to the end of the current line of the file, repositions to the beginning of the next line, if any, and
' outputs all characters found.
' Inputs: the_nFileNo The number of the file.
' Outputs: out_sLine The line from the current position in the file.
' Return: True if there is more data.
Public Function LineInputUTF8(ByVal the_nFileNo As Integer, ByRef out_sLine As String) As Boolean
Dim bytValue As Byte
Dim abytLine() As Byte
Dim nStartOfLinePos As Long
Dim nEndOfLinePos As Long
Dim nStartOfNextLine As Long
Dim nLineLen As Long
' Save the current file position as the beginning of the line, and cache this value.
nStartOfLinePos = Seek(the_nFileNo)
' Retrieves the first byte from the current position.
Get #the_nFileNo, , bytValue
' Loop until the end of file is encountered.
Do Until EOF(the_nFileNo)
' Check whether this byte represents a carriage return or line feed character (indicating new line).
If bytValue = 13 Or bytValue = 10 Then
' By this point, the current position is *after* the CR or LF character, so to get the position of the
' last byte in the line, we must go back two bytes.
nEndOfLinePos = Seek(the_nFileNo) - 2
' If this is a carriage return, then we must check the next character.
If bytValue = 13 Then
Get #the_nFileNo, , bytValue
' Is this a line feed?
If bytValue = 10 Then
' Yes. Assume that CR-LF counts as a single NewLine. So the start of the next line should skip over the line feed.
nStartOfNextLine = nEndOfLinePos + 3
Else
' No. The start of the next line is the current position.
nStartOfNextLine = nEndOfLinePos + 2
End If
ElseIf bytValue = 10 Then
' If this is a line feed, then the start of the next line is the current position.
nStartOfNextLine = nEndOfLinePos + 2
End If
' Since we have processed all the bytes in the line, exit the loop.
Exit Do
End If
' Get the next byte.
Get #the_nFileNo, , bytValue
Loop
' Check to see if there was an end of line.
If nEndOfLinePos = 0 Then
' No, this is the end of the file - so use all the remaining characters.
nLineLen = Seek(the_nFileNo) - nStartOfLinePos - 1
Else
' Yes - so use all the characters up to the end of line position.
nLineLen = nEndOfLinePos - nStartOfLinePos + 1
End If
' Is this line empty?
If nLineLen = 0 Then
' Yes - just return an empty string.
out_sLine = vbNullString
Else
' No - pull all the bytes from the beginning to the end of the line into a byte array, and then convert that from UTF-8 to a VB string.
ReDim abytLine(1 To nLineLen)
Get #the_nFileNo, nStartOfLinePos, abytLine()
out_sLine = FromCPString(abytLine(), CP_UTF8)
End If
' If there is a line afterwards, then move to the beginning of the line, and return True.
If nStartOfNextLine > 0 Then
Seek #the_nFileNo, nStartOfNextLine
LineInputUTF8 = True
End If
End Function
' Purpose: Analogue of 'Open "fileName" For Input As #fileNo' - but also return what type of text this is via a Code Page value.
' Inputs: the_sFileName
' the_iFileNo
' (the_nDesiredCodePage) The code page that you want to use with this file.
' If this value is set to the default, -1, this indicates that the code page will be ascertained from the file.
' Outputs: out_nCodePage There are only six valid values that are returned if <the_nDesiredCodePage> was set to -1.
' CP_ACP ANSI code page
' CP_UTF8 UTF-8
' CP_UTF16LE UTF-16 Little Endian (VB and NT default string encoding)
' CP_UTF16BE UTF-16 Big Endian
' CP_UTF32LE UTF-32 Little Endian
' CP_UTF32BE UTF-32 Big Endian
' (out_bContainedBOM) If this was set to True, then the file started with a BOM (Byte Order Marker).
Public Sub OpenForInput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, ByRef out_nCodePage As Long, Optional ByVal the_nDesiredCodePage As Long = -1, Optional ByRef out_bContainedBOM As Boolean)
' Note if we want to take account of every case, we should read in the first four bytes, and check for UTF-32 low and high endian BOMs, check
' the first three bytes for the UTF-8 BOM, and finally check the first two bytes for UTF-16 low and hight endian BOMs.
Dim abytBOM(1 To 4) As Byte
Dim nCodePage As Long
' By default, there is no BOM.
out_bContainedBOM = False
Open the_sFilename For Binary Access Read As #the_iFileNo
' We are interested in -1 (ascertain code page), and then various UTF encodings.
Select Case the_nDesiredCodePage
Case -1, CP_UTF8, CP_UTF16_BE, CP_UTF16_LE, CP_UTF32_BE, CP_UTF32_LE
' Default code page.
nCodePage = CP_ACP
' Pull in the first four bytes to determine the BOM (byte order marker).
Get #the_iFileNo, , abytBOM()
' The following are the BOMs for text files:
'
' FF FE UTF-16, little endian
' FE FF UTF-16, big endian
' EF BB BF UTF-8
' FF FE 00 00 UTF-32, little endian
' 00 00 FE FF UTF-32, big-endian
'
' Work out the code page from this information.
Select Case abytBOM(1)
Case &HFF
If abytBOM(2) = &HFE Then
If abytBOM(3) = 0 And abytBOM(4) = 0 Then
nCodePage = CP_UTF32_LE
Else
nCodePage = CP_UTF16_LE
End If
End If
Case &HFE
If abytBOM(2) = &HFF Then
nCodePage = CP_UTF16_BE
End If
Case &HEF
If abytBOM(2) = &HBB And abytBOM(3) = &HBF Then
nCodePage = CP_UTF8
End If
Case &H0
If abytBOM(2) = &H0 And abytBOM(3) = &HFE And abytBOM(4) = &HFF Then
nCodePage = CP_UTF32_BE
End If
End Select
' Did we match any BOMs?
If nCodePage = CP_ACP Then
' No - we are still defaulting to the ANSI code page.
' Special check for UTF-8. The BOM is not specified in the standard for UTF-8, but according to Wikipedia (which is always right :-) ),
' only Microsoft includes this marker at the beginning of files.
If FileBytesAreUTF8(the_iFileNo) Then
out_nCodePage = CP_UTF8
Else
out_nCodePage = CP_ACP
End If
Else
' Yes - we have worked out the code page from the BOM.
' If no code page was suggested, we now return the code page we found.
If the_nDesiredCodePage = -1 Then
out_nCodePage = nCodePage
End If
' Inform the caller that a BOM was found.
out_bContainedBOM = True
End If
' Reset the file pointer to the beginning of the file data.
If out_bContainedBOM Then
' Note that if the code page found was one of the two UTF-32 values, then we are already in the correct position.
' Otherwise, we have to move to just after the end of the BOM.
Select Case nCodePage
Case CP_UTF16_BE, CP_UTF16_LE
Seek #the_iFileNo, 3
Case CP_UTF8
Seek #the_iFileNo, 4
End Select
Else
' There is no BOM, so simply go the beginning of the file.
Seek #the_iFileNo, 1
End If
Case Else
out_nCodePage = the_nDesiredCodePage
End Select
End Sub
' Purpose: Analogue of 'Open "fileName" For Append As #fileNo'
Public Sub OpenForAppend(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
' Open the file and move to the end of the file.
Open the_sFilename For Binary Access Write As #the_iFileNo
Seek the_iFileNo, LOF(the_iFileNo) + 1
If the_bPrefixWithBOM Then
WriteBOM the_iFileNo, the_nCodePage
End If
End Sub
' Purpose: Analogue of 'Open "fileName" For Output As #fileNo'
Public Sub OpenForOutput(ByRef the_sFilename As String, ByVal the_iFileNo As Integer, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bPrefixWithBOM As Boolean = True)
' Ensure we overwrite the file by deleting it ...
On Error Resume Next
Kill the_sFilename
On Error GoTo 0
' ... before creating it.
Open the_sFilename For Binary Access Write As #the_iFileNo
If the_bPrefixWithBOM Then
WriteBOM the_iFileNo, the_nCodePage
End If
End Sub
' Purpose: Analogue of the 'Print #fileNo, value' statement. But only one value allowed.
' Setting <the_bAppendNewLine> = False is analagous to 'Print #fileNo, value;'.
Public Sub Print_(ByVal the_iFileNo As Integer, ByRef the_sValue As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional ByVal the_bAppendNewLine As Boolean = True)
Const kbytNull As Byte = 0
Const kbytCarriageReturn As Byte = 13
Const kbytNewLine As Byte = 10
Put #the_iFileNo, , ToCPString(the_sValue, the_nCodePage)
If the_bAppendNewLine Then
Select Case the_nCodePage
Case CP_UTF16_BE
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Case CP_UTF16_LE
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Put #the_iFileNo, , kbytNull
Case CP_UTF32_BE
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Case CP_UTF32_LE
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNewLine
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Put #the_iFileNo, , kbytNull
Case Else
Put #the_iFileNo, , kbytCarriageReturn
Put #the_iFileNo, , kbytNewLine
End Select
End If
End Sub
Public Sub PutContents(ByRef the_sFilename As String, ByRef the_sFileContents As String, Optional ByVal the_nCodePage As Long = CP_ACP, Optional the_bPrefixWithBOM As Boolean)
Dim iFileNo As Integer
iFileNo = FreeFile
OpenForOutput the_sFilename, iFileNo, the_nCodePage, the_bPrefixWithBOM
Print_ iFileNo, the_sFileContents, the_nCodePage, False
Close iFileNo
End Sub
' Purpose: Converts a VB string (UTF-16) to UTF8 - as a binary array.
Private Function ToCPString(ByRef the_sValue As String, ByVal the_nCodePage As Long) As Byte()
Dim abytOutput() As Byte
Dim nValueLen As Long
Dim nOutputByteLen As Long
If the_nCodePage = CP_UTF16_LE Then
ToCPString = the_sValue
Else
' Cache the input length.
nValueLen = Len(the_sValue)
' See how big the output buffer will be.
nOutputByteLen = WideCharToMultiByte(the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, 0&, 0&, 0&, 0&)
If nOutputByteLen > 0 Then
' Resize output byte array to the size of the UTF-8 string.
ReDim abytOutput(1 To nOutputByteLen)
' Make this API call again, this time giving a pointer to the output byte array.
WideCharToMultiByte the_nCodePage, 0&, StrPtr(the_sValue), nValueLen, VarPtr(abytOutput(1)), nOutputByteLen, 0&, 0&
End If
' Return the array.
ToCPString = abytOutput()
End If
End Function
Private Sub WriteBOM(ByVal the_iFileNo As Integer, ByVal the_nCodePage As Long)
' FF FE UTF-16, little endian
' FE FF UTF-16, big endian
' EF BB BF UTF-8
' FF FE 00 00 UTF-32, little endian
' 00 00 FE FF UTF-32, big-endian
Select Case the_nCodePage
Case CP_UTF8
Put #the_iFileNo, , CByte(&HEF)
Put #the_iFileNo, , CByte(&HBB)
Put #the_iFileNo, , CByte(&HBF)
Case CP_UTF16_LE
Put #the_iFileNo, , CByte(&HFF)
Put #the_iFileNo, , CByte(&HFE)
Case CP_UTF16_BE
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&HFF)
Case CP_UTF32_LE
Put #the_iFileNo, , CByte(&HFF)
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&H0)
Case CP_UTF32_BE
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&H0)
Put #the_iFileNo, , CByte(&HFE)
Put #the_iFileNo, , CByte(&HFF)
End Select
End Sub
The following code was added to a Form which had a VSFlexGrid control with Lucida Console font - purely to provide a way to display as many characters as possible:
Option Explicit
Private Sub Command_Click()
Example_ConvertFileToUTF8
End Sub
Private Sub Command2_Click()
Example_IterateUTF8Lines
End Sub
Private Sub Command3_Click()
Example_ReadWriteUTF8Lines
End Sub
Private Sub Form_Load()
VSFlexGrid.ColWidth(0) = 7000!
End Sub
' Purpose: Converts *any* pure text file (UTF16, ASCII, ANSI) to UTF8.
Private Sub Example_ConvertFileToUTF8()
Dim nCodePage As Long
Dim bContainedBOM As Boolean
Dim sFileContents As String
' Read in contents.
sFileContents = TextFile.GetContents("C:\MysteryEncoding.txt", nCodePage, , bContainedBOM)
' And then convert to UTF8.
TextFile.PutContents "C:\output.txt", sFileContents, CP_UTF8, bContainedBOM
End Sub
' Purpose: Iterates through each line of a UTF-8 text file, and adds it to a control which can display VB strings containing non-ANSI characters.
' In this case, I am adding items to a FlexGrid with Font = "Lucida Console".
Private Sub Example_IterateUTF8Lines()
Dim iFileNo As Integer
Dim lCodePage As Long
Dim sLine As String
iFileNo = FreeFile
TextFile.OpenForInput "C:\UTF8.txt", iFileNo, lCodePage
If lCodePage = CP_UTF8 Then
Do While TextFile.LineInputUTF8(iFileNo, sLine)
VSFlexGrid.AddItem sLine
Loop
VSFlexGrid.AddItem sLine
Else
MsgBox "This is not a UTF8 file."
End If
Close #iFileNo
End Sub
Private Sub Example_ReadWriteUTF8Lines()
Dim iFileNoInput As Integer
Dim iFileNoOutput As Integer
Dim lCodePage As Long
Dim bBOM As Boolean
Dim sLine As String
iFileNoInput = FreeFile
TextFile.OpenForInput "C:\UTF8.txt", iFileNoInput, lCodePage, , bBOM
If lCodePage = CP_UTF8 Then
iFileNoOutput = FreeFile
TextFile.OpenForOutput "C:\output.txt", iFileNoOutput, lCodePage, bBOM
Do While TextFile.LineInputUTF8(iFileNoInput, sLine)
TextFile.Print_ iFileNoOutput, sLine, lCodePage
Loop
TextFile.Print_ iFileNoOutput, sLine, lCodePage, False
Close #iFileNoOutput
Else
MsgBox "This is not a UTF8 file."
End If
Close #iFileNoInput
End Sub
这篇关于VB 6.0中文本文件的编码的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!