Created
October 23, 2023 22:42
Revisions
-
suanmiao created this gist
Oct 23, 2023. There are no files selected for viewing
# Please note that the code below is specifically optimized for the MPT-7B-Chat
# model on the Databricks Documentation dataset; modifications may be required
# for other models or datasets.
#
# NOTE(review): the page this snippet was copied from flagged the file as
# containing hidden or bidirectional Unicode text -- open it in an editor that
# reveals such characters before trusting what is displayed.
#
# The script expects a string named `text` to already be in scope and rebinds
# `text` to the cleaned result.

import re
from html import unescape

# Keep all HTML tags
# No code is needed as we are keeping all HTML tags

# Replace multiple newlines or multiple empty lines with one newline
text = re.sub(r'\n\s*\n', '\n', text)

# Keep URLs
# No code is needed as we are keeping all URLs

# Collapse runs of spaces into a single space (newlines are not touched here)
text = re.sub(r'[ ]+', ' ', text)

# Decoding HTML entities
text = unescape(text)

# Drop every character that is not a word character, whitespace, or one of
#     [ ] ( ) $ \ . / : # < > { } , _ " ! @ - * =
# The earlier pattern ended in `\]`, which escaped the bracket that was meant
# to close the character set, so `re.sub` raised `re.error` (unterminated
# character set). The set is now closed properly and `-` is escaped so it is
# read as a literal hyphen rather than as a range operator.
text = re.sub(r'[^\w\s\[\]()$\\./:#<>{},_"!@\-*=]', '', text)

# Remove repeated phrases with single phrase: keep only the first occurrence of
# each whitespace-separated token, preserving first-seen order.
# NOTE: `split()` / `' '.join()` also collapses every whitespace run (including
# the newlines handled above) into a single space.
phrase_list = list(dict.fromkeys(text.split()))

text = ' '.join(phrase_list)