I wrote code to highlight some keywords from a txt file and save the output in a docx file. The output is coming perfectly, however, when I tried to save the file, it showed some errors. The code is as below:
import re
from termcolor import colored
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
def mark_keywords(text, keywords):
marked_text = text
for keyword in keywords:
pattern = r'\b{}\b'.format(re.escape(keyword))
marked_text = re.sub(pattern, colored(keyword, 'red', attrs=['bold']), marked_text)
return marked_text
# Set the input file name
input_file = "input.txt"
try:
# Read input text from the file
with open(input_file, 'r') as file:
input_text = file.read()
except FileNotFoundError:
print("File not found.")
exit()
# Get input keywords from the user
input_keywords = ["we","We","This Paper","This paper", "this paper", "this Paper", "our", "Our", "this study", "This study","This Study","this research", "This Research", "This research"]
# Clean up the keywords (remove leading/trailing whitespaces)
input_keywords = [keyword.strip() for keyword in input_keywords]
# Mark keywords in the input text
marked_text = mark_keywords(input_text, input_keywords)
# Create a new Word document
document = Document()
# Split the marked text into paragraphs
paragraphs = marked_text.split("\n\n")
# Add each paragraph to the document
for paragraph_text in paragraphs:
paragraph = document.add_paragraph(paragraph_text)
# Save the document as output.docx
output_file = "output.docx"
document.save(output_file)
And, error is coming like this:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[17], line 44
42 # Add each paragraph to the document
43 for paragraph_text in paragraphs:
---> 44 paragraph = document.add_paragraph(paragraph_text)
46 # Save the document as output.docx
47 output_file = "output.docx"
File ~\anaconda3\lib\site-packages\docx\document.py:56, in Document.add_paragraph(self, text, style)
47 def add_paragraph(self, text='', style=None):
48 """
49 Return a paragraph newly added to the end of the document, populated
50 with *text* and having paragraph style *style*. *text* can contain
(...)
54 break.
55 """
---> 56 return self._body.add_paragraph(text, style)
File ~\anaconda3\lib\site-packages\docx\blkcntnr.py:37, in BlockItemContainer.add_paragraph(self, text, style)
35 paragraph = self._add_paragraph()
36 if text:
---> 37 paragraph.add_run(text)
38 if style is not None:
39 paragraph.style = style
File ~\anaconda3\lib\site-packages\docx\text\paragraph.py:37, in Paragraph.add_run(self, text, style)
35 run = Run(r, self)
36 if text:
---> 37 run.text = text
38 if style:
39 run.style = style
File ~\anaconda3\lib\site-packages\docx\text\run.py:163, in Run.text(self, text)
161 @text.setter
162 def text(self, text):
--> 163 self._r.text = text
File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:104, in CT_R.text(self, text)
101 @text.setter
102 def text(self, text):
103 self.clear_content()
--> 104 _RunContentAppender.append_to_run_from_text(self, text)
File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:134, in _RunContentAppender.append_to_run_from_text(cls, r, text)
128 """
129 Create a "one-shot" ``_RunContentAppender`` instance and use it to
130 append the run content elements corresponding to *text* to the
131 ``<w:r>`` element *r*.
132 """
133 appender = cls(r)
--> 134 appender.add_text(text)
File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:143, in _RunContentAppender.add_text(self, text)
141 for char in text:
142 self.add_char(char)
--> 143 self.flush()
File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:165, in _RunContentAppender.flush(self)
163 text = ''.join(self._bfr)
164 if text:
--> 165 self._r.add_t(text)
166 del self._bfr[:]
File ~\anaconda3\lib\site-packages\docx\oxml\text\run.py:41, in CT_R.add_t(self, text)
37 def add_t(self, text):
38 """
39 Return a newly added ``<w:t>`` element containing *text*.
40 """
---> 41 t = self._add_t(text=text)
42 if len(text.strip()) < len(text):
43 t.set(qn('xml:space'), 'preserve')
File ~\anaconda3\lib\site-packages\docx\oxml\xmlchemy.py:273, in _BaseChildElement._add_adder.<locals>._add_child(obj, **attrs)
271 child = new_method()
272 for key, value in attrs.items():
--> 273 setattr(child, key, value)
274 insert_method = getattr(obj, self._insert_method_name)
275 insert_method(child)
File src\lxml\etree.pyx:1042, in lxml.etree._Element.text.__set__()
File src\lxml\apihelpers.pxi:748, in lxml.etree._setNodeText()
File src\lxml\apihelpers.pxi:736, in lxml.etree._createTextNode()
File src\lxml\apihelpers.pxi:1541, in lxml.etree._utf8()
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
May be this error is coming from my given text. But I can't find from where exactly this error is coming from.
Can I get a solution for this?