For my thesis, I wanted to make sure that all my references, figures, etc. have consistent formatting regarding whitespace and abbreviations. Furthermore, I wanted to check that I had not made any of the most common errors, such as duplicate words. For this, I created a script (borrowing from various websites) that basically runs a list of regexes on the TeX source files, marking most of these things.
Again, I give this script as-is, hoping it may be useful for you. It sure helped me a lot. First, it defines a lot of regexes, then it simply runs each of them on all files and pipes the results to stdout, marking the "problematic" parts.
#!/bin/bash
# Regexes for the reference and whitespace checks. The values are written in
# double quotes, so every regex backslash is doubled ("\\." is the regex "\.").
# Unless noted otherwise the patterns are EREs (grep -E).

# Match my (self-defined) reference commands if they are not attached with a ~
# (non-breaking space), e.g. the macro figref, which translates to Fig. or
# Figure depending on the version of my thesis. Two cases are flagged: a "~"
# that follows a character other than a letter, digit or one of .,:;)}$ and a
# macro whose preceding character is none of "~", "\" and "(".
references="([^a-zA-Z0-9\\.,:;)}\$]~| ?[^~\\(])\\\\(cite|eqref|figref|tabref|secref|appref|onlinecite)\\{"
# Match references that duplicate the name, e.g. "Fig. \figref": \figref
# already produces the "Fig". figref is part of the macro list so that the
# Fig.~\figref case is actually caught.
doublerefs="([Ff]ig(ure|\\.)|[Tt]ab(le|\\.)|[eE]q(uation|\\.)|[Ss]ec(tion|\\.)|[Aa]pp(endix|\\.)|[Rr]ef(erence|\\.))[ ~]?\\\\(cite|onlinecite|(fig|eq|tab|sec|app)ref)\\{"
# References wherein the macro type (e.g. figref) does not match the prefix of
# the referenced label (e.g. a formula), or reference macros that start a line.
# PCRE only (non-capturing group, look-ahead, back-reference): use grep -P.
badrefs="(?:\\\\(fig|tab|app|sec|eq)ref\\{(?!\\1)|^\\\\((fig|tab|app|sec|eq)ref|cite|onlinecite)\\{)"
# Whitespace before a period or comma, or a period/comma directly followed by
# something other than a digit, whitespace, word character, "}" or "$".
# NOTE: the look-behind "(?<!...)" and the \s/\w escapes inside the bracket
# expression are PCRE syntax; this pattern only works as intended with grep -P.
whitespaces="\\b (\\.|,)|(?<![0-9]{4})(\\.|,)[^0-9\\s\\w\\n}$]"
# Non-breaking spaces next to ordinary whitespace
whitespaces2="(\\s~|~\\s)"
# An opening parenthesis preceded by a plain space
badBrackets=" \\("
# "e.g." / "i.e." written with an inner space, not followed by "~", or glued to
# the preceding text. The last bracket expression is "[^ (]": in an ERE bracket
# expression a backslash is literal, so the former "\n" in there excluded the
# letter "n" and "\" rather than a newline (which grep never sees in a line).
specialexps="(e\\.\\sg\\.|i\\.\\se\\.|(e\\.g\\.|i\\.e\\.)[^~])|([^ (](e\\.\\sg\\.|i\\.\\se\\.|(e\\.g\\.|i\\.e\\.)))"
#Formulas
# Heuristic patterns for short inline formulas ($...$), written for grep -E.
# ERE notes that shaped the bracket expressions below:
#   * ERE has no \d, so digits are spelled [0-9];
#   * a backslash inside a bracket expression is a literal backslash;
#   * a "-" between two characters forms a range, so the literal "-" is placed
#     last. The former "{}-+" contained the range "}-+", which is reversed in
#     ASCII order and makes grep reject the whole pattern in the C locale.

# Body of a short formula: either an optional \macro (with an optional
# sub-/superscript of up to 3 characters and an optional operator) followed by
# 1-10 digits, punctuation characters or \imag, or 1-3 \macro{...} calls with
# 1-3 braced arguments of up to 10 characters each.
shortformulasbody='((\\\w+([_^]\{?.{1,3}\}?)?[+=.-]?)?([0-9.{}+()^-]|\\imag){1,10}|([+=.-]?\\\w+(\{.{1,10}\}){1,3}){1,3})'
# Body of a very short formula (stricter length limits than above).
veryshortbody='([0-9]{0,2}(\w?|\\\w*)([_^](\{.{1,3}\}|.{1,3})){0,2}[+=.-]?([0-9.{}+()^_-]|\\imag){0,4}|([+=.-]?\\\w+(\{.{1,4}\}){1,3}){1,2})'
# Any short formula, including its delimiting dollar signs.
allshortformulas="\\\$${shortformulasbody}\\\$"
# Short formula at the end of a clause: the period/comma sits either just
# inside ("x.$") or just after ("$x$.") the closing dollar sign.
endofsentenceformulas="\\\$${shortformulasbody}([.,]\\\$|\\\$[,.])"
# The above, when the opening "$" is not preceded by "(", "~", "`" or "{".
badendformulas="[^(~\`{]${endofsentenceformulas}"
# Very short formula whose opening "$" is not preceded by "(", "~", "`" or "{".
badshortformulas="[^(~\`{]\\\$${veryshortbody}\\\$"
#Duplicate words
# PCRE pattern (grep -P) for common duplication slips:
#   * a word repeated immediately ("is is"),
#   * a two-word phrase repeated immediately ("in the in the"),
#   * article clashes ("a the", "the a", "a an", ...),
#   * a repeated ".", "," or "~" separated by whitespace (". .").
# Each back-reference uses the number of the capturing group in its own
# alternative (\1, \2, \3); a reference to a group from another alternative is
# unset and can never match.
worddups='(?:\b(\w+)\s+\1\b|\b(\w+\s+\w+)\s+\2\b|\ban?\s+the\b|\bthe\s+an?\b|\ban?\s+an?\b|([.,~])\s+?\3)'
######################HERE COMES THE ACTUAL CHECK
# Usage: <script> FILE...
# Runs every check on all given files and prints the offending lines (with
# line numbers, matches highlighted) to stdout. The exit status is that of the
# last check (word duplicates).
if [ "$#" -eq 0 ]; then
  # Without file operands every grep would read from stdin instead.
  printf 'Usage: %s FILE...\n' "${0##*/}" >&2
  exit 2
fi

# run_check HEADING DIALECT PATTERN FILE...
# Prints HEADING, then searches FILE... case-insensitively for PATTERN.
# DIALECT is the grep option selecting the regex flavour (-E or -P).
# Returns the exit status of grep.
run_check() {
  local heading=$1 dialect=$2 pattern=$3
  shift 3
  printf '%s\n' "$heading"
  grep "$dialect" -i -n --color -e "$pattern" -- "$@"
}

run_check "Checking REF-FORMATTING (e.g. \"xyz\cite{}\" OR \" ~\figref{}\":" -E "($references)" "$@"
run_check "Checking DOUBLED REFS (e.g. Fig.~\figref)" -E "($doublerefs)" "$@"
# The whitespace pattern contains a look-behind, which only PCRE understands.
run_check "Checking WHITESPACES (e.g. \"Sentence .\" OR \".Sentence\" ):" -P "($whitespaces)" "$@"
run_check "Checking DOUBLE WHITSP. (e.g. \"~ \" OR \" ~\" ):" -E "($whitespaces2)" "$@"
run_check "Checking SPECIAL EXP (e.g. \"e.g. \" OR \"e. g.\" ):" -E "($specialexps)" "$@"
run_check "Checking END-FORMULAS (e.g. \"\$\matr{J}\$\") using $badendformulas:" -E "($badendformulas)" "$@"
# Disabled check:
#run_check "Checking SHORT FORMULAS (e.g. \"\$\matr{J}\$\") using $badshortformulas:" -E "($badshortformulas)" "$@"
run_check "Checking Brackets for prepended ~ using $badBrackets:" -E "($badBrackets)" "$@"
# Disabled checks (badLists is not defined in the part of the script shown here):
#run_check "Checking Lists (e.g. \"\$\matr{J}\$\") using $badLists:" -E "($badLists)" "$@"
#grep -E -i -n --color "(\\bwe\\b|\\bus\\b|\\b\\our\\b)" -- "$@"
run_check "Checking un\\ed References or bad reftypes using $badrefs:" -P "$badrefs" "$@"
run_check "Checking Wordduplicates using $worddups:" -P "$worddups" "$@"
exit "$?"