I created a macro in MS Office 2007 for cleaning and formatting. Please take a look at it and let me know how to improve it. I'm kinda out of idea right now. I think it is simple enough not to mess up the actual ebook flow.
And I'm really open to any new idea or if you have better solution. I really new at this macro stuff for a week now, I'm sure some of you might have a better solution.
Btw this forum is amazing. This is the first time I read ebook:). Easier to get a complete a series than if you go to bookstore.
rlaksana
- Code: Select all
' Main Macro:
'
' CleanAndFormat()
' CleanOCR()
' RemoveAllHyperlinks()
' Other macros are for supporting these there main macros.
' USE THIS FOR CLEANING AND FORMATTING EBOOK DOCUMENT WITH A GOOD PARAGRAPH LINE.
'
' Version change log:
' v2.1, July 30 - Added Indent right and left indent to 0 and LineSpacing = LinesToPoints(1)
' v2.2, July 30 - Added font size changes to all 12, because some bad document have mixed up font size + Set all font color to black.
' v2.3, August 01 - Minor changes, added join pages adjusment for font that is not Bold. (For skipping title or heading)
' v2.4, Aug 01 - Minor changes, Remove Paragraph Justify since mobipocket use automatic justification
' v2.5, Aug 02 - Added Single Non Breaking Space in for Convertion to Single Space.
' - Added a conversion from ^s into whitespace before conversion into single whitespace.
' v2.6, Aug 07 - Added comma as a clause for join if a paragraph end or start with a comma.
Sub CleanAndFormat()
' Simple macro.
Application.Run MacroName:="RemovePageAndSectionBreak"
' Convert manual line break to paragraph break.
Application.Run MacroName:="CovertLineBreakToParagraphBreak"
' Convert any space into single space.
Application.Run MacroName:="ConvertAllSpaceToSingleSpace"
' Remove all space before and after paragraph so that later on it can be use with JoinPages more successfully.
Application.Run MacroName:="RemoveAnySpaceBeforeOrAfterParagraph"
' Remove all extras paragraph in document so that join pages will be more successful.
Application.Run MacroName:="ConvertAllToSingleParagraph"
' Added >< on the document after each join for manual reviewing.
Application.Run MacroName:="JoinPages"
' Simple formatting.
Application.Run MacroName:="FormatDocument"
End Sub
' USE THIS ONLY ON OCR DRAFT DOCUMENT.
' Use this macro for cleaning a document that have double paragraph for each new line and single paragraph for line break.
' v1.0 - Original OCR.
' v1.1 - Added convert all ^p three or more into ^p^p, before joining hard break line.
Sub CleanOCR()
Application.Run MacroName:="CleanOCRStepOne"
Application.Run MacroName:="CleanOCRStepTwo"
End Sub
' Removing hyperlinks for some eBook that have a lot of broken links since it only include a single HTML
' or Merging file using toc but didn't not update the hyperlinks manually.
Sub RemoveAllHyperlinks()
Dim i As Integer
For i = ActiveDocument.Hyperlinks.Count To 1 Step -1
ActiveDocument.Hyperlinks(i).Delete
Next i
End Sub
' - Convert all whitespace and non breaking space into single whitespace
' - Remove any whitespace before or after a paragraph.
' - Conversion into a single paragraph.
Sub Tidy()
Application.Run MacroName:="ConvertAllSpaceToSingleSpace"
Application.Run MacroName:="RemoveAnySpaceBeforeOrAfterParagraph"
Application.Run MacroName:="ConvertAllToSingleParagraph"
End Sub
' --------------------------------------------------------------------------
' BELOW IS INDIVIDUAL SUB FOR RUNNING SPECIFIC MACRO.
' --------------------------------------------------------------------------
Sub CleanOCRStepTwo()
' Convert any space into single space.
Application.Run MacroName:="ConvertAllSpaceToSingleSpace"
' Need to run this method one more time because in step one, I can only remove 2 ^p and it will added space after ^p.
' Remove all space before and after paragraph so that later on it can be use with JoinPages more successfully.
Application.Run MacroName:="RemoveAnySpaceBeforeOrAfterParagraph"
' Remove all extras paragraph in document so that join pages will be more successful.
Application.Run MacroName:="ConvertAllToSingleParagraph"
' Need more attention.
Application.Run MacroName:="JoinPages"
' Simple formatting.
Application.Run MacroName:="FormatDocument"
End Sub
Sub CleanOCRStepOne()
Application.Run MacroName:="RemovePageAndSectionBreak"
Application.Run MacroName:="CovertLineBreakToParagraphBreak"
Application.Run MacroName:="RemoveAnySpaceBeforeOrAfterParagraph"
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "^13{3,}"
.Replacement.Text = "^p^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = True
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "^p^p"
.Replacement.Text = "{br}"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "^p"
.Replacement.Text = " "
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "{br}"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
' Important to support join page.
Sub RemovePageAndSectionBreak()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "^b"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "^m"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
' Important to support join page.
Sub RemoveAnySpaceBeforeOrAfterParagraph()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "[ ^s]{1,}^13"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "^13[ ^s]{1,}"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
' Important to support join pages.
Sub ConvertAllSpaceToSingleSpace()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "^s"
.Replacement.Text = " "
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = " {2,}"
.Replacement.Text = " "
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = " "
.Replacement.Text = " "
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
' Joining pages where in original book it's separated and in eBook version
' the paragraph should be together not in different paragraph.
Sub JoinPages()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
' Needed for skipping headings, title and any bold text
Selection.Find.Font.Bold = False
' Step 1: join lower case letter in the first line that have a line break before it.
With Selection.Find
.Text = "^13[a-z,]"
.Replacement.Text = " ><^&"
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = " [>][<]*^13"
.Replacement.Text = " >< "
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
' Step 2: join a letter that has a lower case letter before its paragraph
With Selection.Find
.Text = "[,a-z]^13"
'foo^p><
.Replacement.Text = "^&>< "
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
With Selection.Find
.Text = "^13[>][<] "
.Replacement.Text = " >< "
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
Sub FormatDocument()
Selection.WholeStory
' Remove Paragraph Justify since mobipocket use automatic justification
' Selection.ParagraphFormat.Alignment = wdAlignParagraphJustify
' Font Styles -- Important to set font size 12 since some eBook contains normal text but different size which is a mistake.
Selection.Font.Size = 12
Selection.Font.Color = -587137025
' Important: some eBook contains mixed up line spacing in normal paragraph, so I need to set it to 1.
Selection.ParagraphFormat.LineSpacing = LinesToPoints(1)
With Selection.PageSetup
.LineNumbering.Active = False
.Orientation = wdOrientPortrait
.TopMargin = InchesToPoints(0.5)
.BottomMargin = InchesToPoints(0.5)
.LeftMargin = InchesToPoints(0.5)
.RightMargin = InchesToPoints(0.5)
.Gutter = InchesToPoints(0)
.HeaderDistance = InchesToPoints(0.5)
.FooterDistance = InchesToPoints(0.5)
.PageWidth = InchesToPoints(8.5)
.PageHeight = InchesToPoints(11)
.FirstPageTray = wdPrinterDefaultBin
.OtherPagesTray = wdPrinterDefaultBin
.SectionStart = wdSectionNewPage
.OddAndEvenPagesHeaderFooter = False
.DifferentFirstPageHeaderFooter = False
.VerticalAlignment = wdAlignVerticalTop
.SuppressEndnotes = False
.MirrorMargins = False
.TwoPagesOnOne = False
.BookFoldPrinting = False
.BookFoldRevPrinting = False
.BookFoldPrintingSheets = 1
.GutterPos = wdGutterPosLeft
End With
With Selection.ParagraphFormat
.SpaceBeforeAuto = False
.SpaceAfterAuto = False
.FirstLineIndent = InchesToPoints(0.13)
.RightIndent = InchesToPoints(0)
.LeftIndent = InchesToPoints(0)
End With
WordBasic.OpenOrCloseParaBelow
WordBasic.OpenOrCloseParaAbove
WordBasic.OpenOrCloseParaBelow
WordBasic.OpenOrCloseParaAbove
End Sub
' Important to support join pages.
Sub ConvertAllToSingleParagraph()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "^13{2,}"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchAllWordForms = False
.MatchSoundsLike = False
.MatchWildcards = True
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub
' Important to support join pages.
Sub CovertLineBreakToParagraphBreak()
Selection.Find.ClearFormatting
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "^l"
.Replacement.Text = "^p"
.Forward = True
.Wrap = wdFindContinue
.Format = False
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
End Sub