refactor: Preserve original metadata when embedding GUID

2025-06-03 15:59:42 -06:00
parent 86f3a08bbc
commit 1c083132a2
1 changed files with 44 additions and 31 deletions
@@ -27,46 +27,59 @@ def modify_feed(context, feed):
            guid_text_to_embed = f" Guid: {new_guid_str}"

            source_path = article.source_path
-            
-            # Reconstruct metadata header from article.metadata
-            # article.metadata keys are typically lowercase. Capitalize them for convention.
-            metadata_header_lines = []
-            for key, value in article.metadata.items():
-                if isinstance(value, list):
-                    # Convert list items to string and join with comma (e.g., for tags)
-                    metadata_header_lines.append(f"{key.capitalize()}: {', '.join(map(str, value))}")
-                else:
-                    # Ensure value is string for concatenation
-                    metadata_header_lines.append(f"{key.capitalize()}: {str(value)}")
-            
-            # article._content holds the raw Markdown content string (after metadata parsing)
+
+            # Ensure article object has the _content attribute
            if not hasattr(article, '_content'):
                log.error(f"Article '{article.title}' does not have '_content' attribute. Cannot embed Guid into source file.")
-                # This is a critical issue for the requested operation.
                raise Exception(f"Cannot find raw content for article '{article.title}' to embed Guid.")

-            markdown_body = article._content
+            # Read the original file content.
+            # Python's open() in text mode uses universal newlines by default, converting \r\n and \r to \n.
+            # Pelican's MarkdownReader also provides article._content with \n newlines.
+            try:
+                with open(source_path, 'r', encoding='utf-8') as f:
+                    original_file_content_universal_newlines = f.read()
+            except Exception as e:
+                log.error(f"Failed to read original content from '{source_path}': {e}")
+                raise

-            # Split the markdown body into the first paragraph and the rest
-            # Paragraphs in Markdown are separated by one or more blank lines (\n\n)
-            parts = markdown_body.split('\n\n', 1)
-            first_paragraph_text = parts[0]
-            rest_of_body = parts[1] if len(parts) > 1 else ""
+            # Sanity check: the article's body content should be a suffix of the read file content.
+            if not original_file_content_universal_newlines.endswith(article._content):
+                log.error(f"Content mismatch for '{article.title}' in '{source_path}'. "
+                          "The article's parsed content (article._content) does not match the "
+                          "ending of the raw file (read with universal newlines). This is unexpected "
+                          "and may indicate issues with file parsing or concurrent modifications.")
+                # For debugging, one might log tails of both strings here.
+                # log.debug(f"Tail of original file content: '{original_file_content_universal_newlines[-200:]}'")
+                # log.debug(f"Tail of article._content: '{article._content[-200:]}'")
+                raise Exception(f"Content boundary determination error for article '{article.title}'.")
+            
+            # Determine the metadata part by subtracting the length of the content body.
+            metadata_section_length = len(original_file_content_universal_newlines) - len(article._content)
+            metadata_part_from_file = original_file_content_universal_newlines[:metadata_section_length]

-            # Append the Guid text to the end of the first paragraph
+            # current_body_content is what Pelican parsed as the article's body.
+            current_body_content = article._content
+
+            # Split this body content to find its first paragraph.
+            # Paragraphs in Markdown are separated by one or more blank lines (\n\n).
+            body_parts = current_body_content.split('\n\n', 1)
+            first_paragraph_of_body = body_parts[0]
+            rest_of_body_content = body_parts[1] if len(body_parts) > 1 else ""
+
+            # Append the Guid text to the end of the first paragraph of the body.
            # .rstrip() removes any trailing whitespace/newlines from the paragraph itself before appending.
-            modified_first_paragraph = first_paragraph_text.rstrip() + guid_text_to_embed
+            modified_first_paragraph_of_body = first_paragraph_of_body.rstrip() + guid_text_to_embed
            
-            # Reconstruct the new markdown body
-            new_markdown_body = modified_first_paragraph
-            if rest_of_body: # Add back the rest of the body with the double newline separator
-                new_markdown_body += '\n\n' + rest_of_body
+            # Reconstruct the new body content with the embedded Guid.
+            new_body_content_with_guid = modified_first_paragraph_of_body
+            if rest_of_body_content: # Add back the rest of the body if it existed.
+                new_body_content_with_guid += '\n\n' + rest_of_body_content
            
-            # Combine metadata and new body to form the complete new file content
-            if metadata_header_lines:
-                full_new_content = "\n".join(metadata_header_lines) + "\n\n" + new_markdown_body
-            else: # No metadata, just the body
-                full_new_content = new_markdown_body
+            # Construct the full new file content by combining the original metadata part and the new body.
+            # This preserves the original metadata block's text verbatim (comments and formatting included), except for line endings:
+            # metadata_part_from_file is a direct slice of original_file_content_universal_newlines, whose newlines were normalized to \n on read.
+            full_new_content = metadata_part_from_file + new_body_content_with_guid

            try:
                with open(source_path, 'w', encoding='utf-8') as f: