Skip to content

Instantly share code, notes, and snippets.

@suanmiao
Created October 23, 2023 22:42

Revisions

  1. suanmiao created this gist Oct 23, 2023.
    32 changes: 32 additions & 0 deletions data_cleaning_mpt_7b_chat_experiment.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,32 @@
    # Please note that the code below is specifically optimized for the MPT-7B-Chat model on the Databricks Documentation dataset; modifications may be required for other models or datasets.


    import re
    from html import unescape

    # Text-cleaning pipeline.
    #
    # Expects a string named `text` to already be bound by the caller and
    # rebinds `text` to the cleaned result. The steps run in the order written;
    # each one operates on the output of the previous one.

    # HTML tags are kept: no step targets them, and the character filter below
    # retains '<', '>' and '/'.

    # Collapse any run of blank (or whitespace-only) lines into one newline.
    text = re.sub(r'\n\s*\n', '\n', text)

    # URLs are not specifically targeted by any step.
    # NOTE(review): the character filter below still drops '?', '&' and '%',
    # so URLs that contain query strings will be altered.

    # Squeeze runs of spaces into a single space; newlines are not touched here.
    text = re.sub(r'[ ]+', ' ', text)

    # Decode HTML entities (e.g. '&lt;' -> '<') before filtering characters, so
    # the filter sees the decoded characters rather than the entity syntax.
    text = unescape(text)

    # Delete every character that is not a word character, whitespace, or one
    # of: [ ] ( ) $ \ . / : # < > { } , " ! @ - * =
    # The previous pattern ended in an escaped ']' and therefore never closed
    # the character class, which made re.sub raise re.error.
    text = re.sub(r'[^\w\s\[\]()$\\./:#<>{},"!@\-*=]', '', text)

    # Keep only the first occurrence of each whitespace-separated token,
    # preserving first-seen order (dict keys keep insertion order).
    # NOTE(review): this de-duplicates tokens across the whole text, and
    # split()/join() turn every newline kept above into a single space.
    text = ' '.join(dict.fromkeys(text.split()))