Tiền Xử Lý Văn Bản Trong Ngôn Ngữ Tự Nhiên

Tiền xử lý văn bản, hay nói cách khác là làm sạch văn bản, là một giai đoạn rất quan trọng: nó giúp loại bỏ các phần văn bản “thừa” để việc trích xuất đặc trưng đạt hiệu quả cao nhất, đồng thời nâng cao chất lượng xử lý của các mô hình và thuật toán.

1. Loại bỏ URL

# ========== REMOVE URLS ==========
import re

# NOTE: raw_text is not defined in this snippet; it must already hold the
# text to clean (e.g. loaded or crawled in an earlier step) before this runs.
# Delete every token that starts with "http", up to the next whitespace.
raw_text = re.sub(r"http\S+", "", raw_text)

# Mode "w" truncates the file on open, so the cleaned text is computed first
# and the file is only touched once there is something to write.
with open("dataout/Căng_Chỉ.txt", "w", encoding="utf-8-sig") as file:
    file.write(raw_text)

print(raw_text)

2. Đưa về dạng viết thường

# ========== CONVERT TO LOWERCASE ==========

# NOTE: raw_text is not defined in this snippet; it must already hold the
# text to normalize (e.g. the result of the previous cleaning step).
raw_text = raw_text.lower()

# Mode "w" truncates the file on open, so the text is lower-cased first and
# the file is only touched once there is something to write.
with open("dataout/Căng_Chỉ.txt", "w", encoding="utf-8-sig") as file:
    file.write(raw_text)

print(raw_text)

3. Xoá khoảng trắng thừa

# ========== REMOVE EXTRA WHITESPACE ==========
import re

with open("dataout/Cấy_Tóc.txt", "r", encoding="utf-8-sig") as file:
    # Collapse every run of whitespace (spaces, tabs, newlines) into a single
    # space, then trim the space left at either end of the text.
    raw_text = re.sub(r"\s+", " ", file.read()).strip()

# Write the normalized text back over the same file.
with open("dataout/Cấy_Tóc.txt", "w", encoding="utf-8-sig") as file:
    file.write(raw_text)

print(raw_text)

4. Xoá Stopword theo danh sách từ định sẵn

# ========== REMOVE STOPWORDS AND EXTRA WHITESPACE ==========
# Words are lower-cased before the lookup, so entries here must be lowercase.
stopword = {"share:"}

with open("dataout/Cấy_Tóc.txt", "r", encoding="utf-8-sig") as file:
    raw_text = file.read()

# split() with no argument splits on any run of whitespace, so re-joining the
# kept words with single spaces also removes the extra whitespace.
old_words = raw_text.split()
kept_words = [word for word in old_words if word.lower() not in stopword]
new_text = " ".join(kept_words)

with open("dataout/Cấy_Tóc.txt", "w", encoding="utf-8-sig") as file:
    file.write(new_text)

print(new_text)
# The messages report a number of words, so count words, not characters.
print("Độ dài (số lượng từ) cũ của text: ", len(old_words))
print("Độ dài (số lượng từ) mới của text: ", len(kept_words))

5. Xoá các Emoji

# ========== REMOVE EMOJI ==========
import re

# Character class listing the code points treated as emoji / pictographs.
# NOTE(review): the ranges U+24C2-U+1F251 and U+10000-U+10FFFF are very
# broad; besides emoji they also cover CJK ideographs, kana, Hangul and every
# supplementary-plane character. That is harmless for Latin-script
# (Vietnamese) text, but narrow them before using this on other scripts.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters (broad range, see note)
    "\U0001f926-\U0001f937"  # gesture / people emoji
    "\U00010000-\U0010ffff"  # all supplementary planes (see note)
    "\u2640-\u2642"  # gender signs
    "\u2600-\u2B55"  # misc symbols through heavy circle
    "\u200d"  # zero-width joiner (glues emoji sequences together)
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "]+",
    flags=re.UNICODE,
)

with open("dataout/Cấy_Tóc.txt", "r", encoding="utf-8-sig") as file:
    raw_text = file.read()

# Replace every run of matched characters with the empty string.
new_text = emoji_pattern.sub("", raw_text)

with open("dataout/Cấy_Tóc.txt", "w", encoding="utf-8-sig") as file:
    file.write(new_text)

print(new_text)
# The messages report a number of words, so count words, not characters.
print("Độ dài (số lượng từ) cũ của text: ", len(raw_text.split()))
print("Độ dài (số lượng từ) mới của text: ", len(new_text.split()))