import { Schema } from "prosemirror-model";
import {
  MarkdownParser,
  schema as baseSchema,
  defaultMarkdownParser,
} from "prosemirror-markdown";
import markdownit from "markdown-it";
import { tableNodes } from "./tableNodes";
import DOMPurify from "dompurify";

/**
 * Mark spec map adding a `strikethrough` mark.
 *
 * The mark is parsed from `<s>`, `<del>` and `<strike>` elements and is always
 * serialized as an `<s>` element wrapping the marked content.
 */
const strikethroughMark = {
  strikethrough: {
    // One parse rule per tag that should be read as strikethrough.
    parseDOM: ["s", "del", "strike"].map((tag) => ({ tag })),
    // `0` is the content hole: the marked content is rendered inside the <s>.
    toDOM: () => ["s", 0] as readonly [string, ...any[]],
  },
};

/**
 * Node spec map for raw inline HTML encountered in markdown.
 *
 * The node is an inline atom. Its `isBreak` attribute records whether the
 * original HTML was a line break: break nodes are serialized as `<br>`, every
 * other inline HTML node is serialized as an empty `<span>`.
 */
const htmlInlineNode = {
  html_inline: {
    group: "inline",
    inline: true,
    atom: true,
    attrs: {
      // `toDOM` reads `isBreak`, so it has to be declared here; without a
      // declaration the attribute is never present on the node and every
      // html_inline node renders as <span>.
      isBreak: { default: false },
    },
    parseDOM: [
      {
        // Only <br> elements are accepted as inline HTML.
        // NOTE(review): if another node earlier in the schema also parses <br>
        // (e.g. a hard-break node), that rule may take precedence — confirm.
        tag: "br",
        getAttrs: () => ({ isBreak: true }),
      },
    ],
    /**
     * Serializes the node to a DOM output spec.
     *
     * @param node - The html_inline node being rendered; only `attrs.isBreak` is read.
     * @returns `["br"]` for line breaks, `["span"]` for any other inline HTML.
     */
    toDOM(node: {
      attrs: { readonly [name: string]: unknown };
    }): readonly [string, ...any[]] {
      return node.attrs.isBreak ? ["br"] : ["span"];
    },
  },
};

// Node specs produced by the project's table helper.
const tableNodeSpecs = tableNodes({
  tableGroup: "block",
  cellContent: "inline+",
  cellAttributes: {},
});

// Base markdown nodes first, then the table nodes, then the inline-HTML node.
const nodeSpecs = baseSchema.spec.nodes
  .append(tableNodeSpecs)
  .append(htmlInlineNode);

// Base markdown marks plus strikethrough.
const markSpecs = baseSchema.spec.marks.append(strikethroughMark);

/**
 * Editor schema: the prosemirror-markdown base schema extended with table
 * nodes, the `html_inline` node and the `strikethrough` mark.
 */
export const schema = new Schema({ nodes: nodeSpecs, marks: markSpecs });

// Options for the shared markdown-it instance: raw HTML is tokenized, single
// newlines become breaks, bare URLs are linkified, void tags are emitted in
// XHTML form, and typographic replacements are off.
const markdownItOptions = {
  html: true,
  xhtmlOut: true,
  breaks: true,
  linkify: true,
  typographer: false,
};

// Shared markdown-it instance (used for both HTML rendering and ProseMirror
// parsing) with the image rule disabled.
const MarkdownIt = markdownit(markdownItOptions).disable(["image"]);

// Custom renderer for inline HTML tokens: a line-break tag is emitted as
// `<br>`, every other inline HTML token is dropped (rendered as an empty
// string). All common spellings of the break tag are accepted — `<br>`,
// `<br/>`, `<br />` and upper-case variants — not just the exact string `<br>`.
MarkdownIt.renderer.rules.html_inline = (tokens, idx) => {
  const content = tokens?.[idx]?.content ?? "";
  return /^<br\s*\/?>$/i.test(content.trim()) ? "<br>" : "";
};

// Custom renderer for the opening tag of an ordered list. It always writes an
// explicit `start` attribute so the rendered <ol> keeps the numbering of the
// markdown source (a list that begins at "2." renders as <ol start="2">).
// When the token carries no `start` attribute the list starts at 1.
MarkdownIt.renderer.rules.ordered_list_open = (tokens, idx) => {
  const startAttr = tokens[idx]?.attrGet("start");
  const start = startAttr ? startAttr : "1";
  return '<ol start="' + start + '">';
};

// Matches a Unicode escape whose leading backslash was itself escaped: two
// literal backslashes, `u`, then four hex digits. We've seen LLMs emit this
// form instead of the character itself (e.g. for U+2022, the bullet).
const DOUBLE_ESCAPED_UNICODE = /\\\\u([0-9a-f]{4})/gi;

// Matches one fenced code block tagged `markdown` and captures its contents.
// - The body is matched non-greedily so each tagged block is unwrapped on its
//   own; a greedy match would run to the LAST fence in the document and
//   swallow unrelated code blocks that follow.
// - The closing fence must be alone on its line, so an opening fence such as
//   "```js" is never mistaken for the end of the block.
// - Both LF and CRLF line endings are accepted.
// Limitation: a tagged block that itself contains bare nested fences ends at
// the first bare fence.
const MARKDOWN_FENCE =
  /```markdown[ \t]*\r?\n([\s\S]*?)\r?\n```(?=[ \t]*(?:\r?\n|$))/gi;

/**
 * Converts a markdown string to sanitized HTML.
 *
 * Before rendering, the input is normalized:
 *  1. double-escaped Unicode sequences are replaced by the character they encode;
 *  2. code fences tagged `markdown` are unwrapped — LLMs sometimes return a
 *     markdown table wrapped in such a fence, which would otherwise render as
 *     a code block instead of a table.
 *
 * @param markdown - Raw markdown text.
 * @returns The rendered HTML after DOMPurify sanitization, with surrounding
 *   whitespace trimmed.
 */
export const markdownToHtml = (markdown: string): string => {
  const normalized = markdown
    .replace(DOUBLE_ESCAPED_UNICODE, (_match, hexCode: string) =>
      String.fromCharCode(parseInt(hexCode, 16))
    )
    .replace(MARKDOWN_FENCE, (_match, contents: string) => contents);
  const html = MarkdownIt.render(normalized);
  return DOMPurify.sanitize(html).trim();
};

// Token-to-schema mapping for the MarkdownParser: the prosemirror-markdown
// defaults plus the table tokens, the strikethrough mark and inline HTML.
const tokens = {
  ...defaultMarkdownParser.tokens,
  table: { block: "table" },
  tr: { block: "table_row" },
  th: { block: "table_header" },
  td: { block: "table_cell" },
  thead: { block: "table_head" },
  tbody: { block: "table_body" },
  s: { mark: "strikethrough" },
  html_inline: {
    node: "html_inline",
    // Record whether the inline HTML was a line-break tag. The html_inline
    // node's toDOM reads `attrs.isBreak` to choose between <br> and <span>;
    // without this the token content is discarded and the attribute is never set.
    // NOTE(review): this only takes effect if the html_inline node spec
    // declares an `isBreak` attribute — confirm against the schema.
    getAttrs: (token: { content: string }) => ({
      isBreak: /^<br\s*\/?>$/i.test(token.content.trim()),
    }),
  },
};

/**
 * Parser that turns markdown text into a ProseMirror document of `schema`,
 * tokenizing with the shared markdown-it instance and the mapping above.
 */
export const markdownParser = new MarkdownParser(schema, MarkdownIt, tokens);
