1

I wrote code to highlight some keywords from a txt file and save the output in a docx file. The output is coming perfectly, however, when I tried to save the file, it showed some errors. The code is as below:

import re
from termcolor import colored
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

def mark_keywords(text, keywords):
    marked_text = text
    for keyword in keywords:
        pattern = r'\b{}\b'.format(re.escape(keyword))
        marked_text = re.sub(pattern, colored(keyword, 'red', attrs=['bold']), marked_text)
    return marked_text

# Set the input file name
input_file = "input.txt"

try:
    # Read input text from the file
    with open(input_file, 'r') as file:
        input_text = file.read()
except FileNotFoundError:
    print("File not found.")
    exit()

# Get input keywords from the user
input_keywords = ["we","We","This Paper","This paper", "this paper", "this Paper", "our", "Our", "this study", "This study","This Study","this research", "This Research", "This research"]

# Clean up the keywords (remove leading/trailing whitespaces)
input_keywords = [keyword.strip() for keyword in input_keywords]

# Mark keywords in the input text
marked_text = mark_keywords(input_text, input_keywords)


# Create a new Word document
document = Document()
    
# Split the marked text into paragraphs
paragraphs = marked_text.split("\n\n")

# Add each paragraph to the document
for paragraph_text in paragraphs:
    paragraph = document.add_paragraph(paragraph_text)

# Save the document as output.docx
output_file = "output.docx"
document.save(output_file)

And, error is coming like this:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[17], line 44
     42 # Add each paragraph to the document
     43 for paragraph_text in paragraphs:
---> 44     paragraph = document.add_paragraph(paragraph_text)
     46 # Save the document as output.docx
     47 output_file = "output.docx"

File ~\anaconda3\lib\site-packages\docx\document.py:56, in Document.add_paragraph(self, text, style)
     47 def add_paragraph(self, text='', style=None):
     48     """
     49     Return a paragraph newly added to the end of the document, populated
     50     with *text* and having paragraph style *style*. *text* can contain
   (...)
     54     break.
     55     """
---> 56     return self._body.add_paragraph(text, style)

File ~\anaconda3\lib\site-packages\docx\blkcntnr.py:37, in BlockItemContainer.add_paragraph(self, text, style)
     35 paragraph = self._add_paragraph()
     36 if text:
---> 37     paragraph.add_run(text)
     38 if style is not None:
     39     paragraph.style = style

File ~\anaconda3\lib\site-packages\docx\text\paragraph.py:37, in Paragraph.add_run(self, text, style)
     35 run = Run(r, self)
     36 if text:
---> 37     run.text = text
     38 if style:
     39     run.style = style

File ~\anaconda3\lib\site-packages\docx\text\run.py:163, in Run.text(self, text)
    161 @text.setter
    162 def text(self, text):
--> 163     self._r.text = text

File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:104, in CT_R.text(self, text)
    101 @text.setter
    102 def text(self, text):
    103     self.clear_content()
--> 104     _RunContentAppender.append_to_run_from_text(self, text)

File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:134, in _RunContentAppender.append_to_run_from_text(cls, r, text)
    128 """
    129 Create a "one-shot" ``_RunContentAppender`` instance and use it to
    130 append the run content elements corresponding to *text* to the
    131 ``<w:r>`` element *r*.
    132 """
    133 appender = cls(r)
--> 134 appender.add_text(text)

File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:143, in _RunContentAppender.add_text(self, text)
    141 for char in text:
    142     self.add_char(char)
--> 143 self.flush()

File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:165, in _RunContentAppender.flush(self)
    163 text = ''.join(self._bfr)
    164 if text:
--> 165     self._r.add_t(text)
    166 del self._bfr[:]

File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:41, in CT_R.add_t(self, text)
     37 def add_t(self, text):
     38     """
     39     Return a newly added ``<w:t>`` element containing *text*.
     40     """
---> 41     t = self._add_t(text=text)
     42     if len(text.strip()) < len(text):
     43         t.set(qn('xml:space'), 'preserve')

File ~\anaconda3\lib\site-packages\docx\oxml\xmlchemy.py:273, in _BaseChildElement._add_adder.<locals>._add_child(obj, **attrs)
    271 child = new_method()
    272 for key, value in attrs.items():
--> 273     setattr(child, key, value)
    274 insert_method = getattr(obj, self._insert_method_name)
    275 insert_method(child)

File src\lxml\etree.pyx:1042, in lxml.etree._Element.text.__set__()

File src\lxml\apihelpers.pxi:748, in lxml.etree._setNodeText()

File src\lxml\apihelpers.pxi:736, in lxml.etree._createTextNode()

File src\lxml\apihelpers.pxi:1541, in lxml.etree._utf8()

ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters

May be this error is coming from my given text. But I can't find from where exactly this error is coming from.

Can I get a solution for this?

rabby26
  • 13
  • 3

1 Answers1

0

It could be because you are trying to save output of colored() from termcolor which adds special characters to its input text which is rendered by the terminal.

If you want to show colored output in docx, here is an example that might help!

Kartoos
  • 737
  • 1
  • 8
  • 24