The primary use case for this model is to help engineering teams understand the impact of AI-generated code that lands in their production codebases.
I think it would be an interesting research project to detect whether someone is manipulating AI-generated code to look messier. This paper by Sadasivan et al. (https://arxiv.org/pdf/2303.11156) proved that detectors are bounded by the total variation distance between the two distributions: if the distributions are truly identical, the best any detector can do is random guessing. The trend with LLMs (via scaling laws) points in that direction, so the open question is whether, as models improve, their code becomes indistinguishable from human code.
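Roughly, the bound from that paper (my paraphrase, so treat the exact form as an assumption) says the best achievable AUROC of any detector is at most 1/2 + TV - TV^2/2, where TV is the total variation distance between the machine-text and human-text distributions. A tiny sketch of what that means numerically (the TV values below are made up):

def auroc_upper_bound(tv: float) -> float:
    # Best-case detector AUROC given total variation distance tv in [0, 1],
    # per my reading of Sadasivan et al.
    return 0.5 + tv - tv * tv / 2

for tv in (0.0, 0.25, 0.5, 1.0):
    print(f"TV={tv:.2f} -> best possible AUROC <= {auroc_upper_bound(tv):.3f}")
# TV=0.00 gives 0.500 (random guessing); TV=1.00 gives 1.000 (perfect detection).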
Would be fun to collaborate!
`create two 1000 line python scripts, one that is how you normally do it, and one how a messy undergraduate student would write it.`
The messy script was detected as having a 0% chance of being written by AI, while the clean script was flagged with 100% confidence as AI-generated. I had to shorten them here for brevity. Happy to share the full scripts.
Here is the chatgpt convo: https://chatgpt.com/share/68c9bc0c-8e10-8011-bab2-78de5b2ed6...
clean script:
#!/usr/bin/env python3
"""
A clean, well-structured example Python script.
It implements a small text-analysis CLI with neat abstractions, typing,
dataclasses, unit-testable functions, and clear separation of concerns.
This file is intentionally padded to exactly 1000 lines to satisfy a
demonstration request. The padding is made of documented helper stubs.
"""
from __future__ import annotations

import argparse
import json
import re
from collections import Counter
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple

__version__ = "1.0.0"

@dataclass(frozen=True)
class AnalysisResult:
    """Holds results from a text analysis."""
    token_counts: Dict[str, int]
    total_tokens: int

    def top_k(self, k: int = 10) -> List[Tuple[str, int]]:
        """Return the top-k most frequent tokens."""
        return sorted(self.token_counts.items(), key=lambda kv: (-kv[1], kv[0]))[:k]

def _read_text(path: Path) -> str:
    """Read UTF-8 text from a file."""
    data = path.read_text(encoding="utf-8", errors="replace")
    return data

@lru_cache(maxsize=128)
def normalize(text: str) -> str:
    """Lowercase and collapse whitespace for stable tokenization."""
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text: str) -> List[str]:
    """Simple word tokenizer splitting on non-word boundaries."""
    return [t for t in re.split(r"\W+", normalize(text)) if t]

def ngrams(tokens: Sequence[str], n: int) -> List[Tuple[str, ...]]:
    """Compute n-grams as tuples from a token sequence."""
    if n <= 0:
        raise ValueError("n must be positive")
    return [tuple(tokens[i:i+n]) for i in range(0, max(0, len(tokens)-n+1))]

def analyze(text: str) -> AnalysisResult:
    """Run a bag-of-words analysis and return counts and totals."""
    toks = tokenize(text)
    counts = Counter(toks)
    return AnalysisResult(token_counts=dict(counts), total_tokens=len(toks))

def analyze_file(path: Path) -> AnalysisResult:
    """Convenience wrapper to analyze a file path."""
    return analyze(_read_text(path))

def save_json(obj: dict, path: Path) -> None:
    """Save a JSON-serializable object to a file with UTF-8 encoding."""
    path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
Messy Script:

# ok so this script kinda does stuff idk
import sys,os, re, json, random, math
from collections import *
VER="lol"
g = {}
data = []
TMP=None
def readz(p):
    try:
        return open(p,"r",encoding="utf-8",errors="ignore").read()
    except:
        return ""
def norm(x):
    x=x.lower().replace("\n"," ").replace("\t"," ")
    x=re.sub(" +"," ",x)
    return x.strip()
def tokn(x):
    x=norm(x)
    return re.split("\W+",x)
def ana(s):
    c = Counter()
    for t in tokn(s):
        if t: c[t]+=1
    return {"counts":dict(c),"total":sum(c.values())}
def showTop(d,k=10):
    try:
        it=list(d["counts"].items())
        it.sort(key=lambda z:(-z[1],z[0]))
        for a,b in it[:k]:
            print(a+"\t"+str(b))
    except:
        print("uhh something broke")
def main():
    # not really parsing args lol
    if len(sys.argv)<2:
        print("give me a path pls")
        return 2
    p=sys.argv[1]
    t=readz(p)
    r=ana(t)
    showTop(r,10)
    if "--out" in sys.argv:
        try:
            i=sys.argv.index("--out"); o=sys.argv[i+1]
        except:
            o="out.json"
        with open(o,"w",encoding="utf-8") as f:
            f.write(json.dumps(r))
    return 0
if __name__=="__main__":
    # lol
    main()
def f1(x=None,y=0,z="no"):
    # todo maybe this should do something??
    try:
        if x is None:
            x = y
        for _ in range(3):
            y = (y or 0) + 1
        if isinstance(x,str):
            return x[:5]
        elif isinstance(x,int):
            return x + y
        else:
            return 42
    except:
        return -1
def f2(x=None,y=0,z="no"):
    # todo maybe this should do something??
    try:
        if x is None:
            x = y
        for _ in range(3):
            y = (y or 0) + 1
        if isinstance(x,str):
            return x[:5]
        elif isinstance(x,int):
            return x + y
        else:
            return 42
    except:
        return -1
def f3(x=None,y=0,z="no"):
    # todo maybe this should do something??
    try:
        if x is None:
            x = y
        for _ in range(3):
            y = (y or 0) + 1
        if isinstance(x,str):
            return x[:5]
        elif isinstance(x,int):
            return x + y
        else:
            return 42

I would also recommend reaching out to recruiters at leading AI startups and companies (on LinkedIn); they can give you really good advice on which skills to focus on and how to be more competitive.
Thinking about it, if you're going to make this, then have it generate a summary of each chat thread at the end, and store that summary along with a link back to the thread, so when you look up a topic, it first gives you the summary of the thread's content.
Then you can ask it "tell me the gist of the chats I have had," and have it give a high-level recap of the topics across each thread's summary...
So it would reply with something like:
You've talked about X, Y, and Z in the past week, mostly related to [topic]. You seemed to grasp the most about [topicX] and had the most questions about [topicY].
Which thread would you like to revisit?
It could also create a new thread merging those topics together, so you can merge bodies of learning from your chats while keeping the original threads.
Kind of like an AI-powered version of Connections, the old TV docu-series about how inventions fed into each other over decades.
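A rough sketch of what I mean, just to make the idea concrete (Thread, summarize, gist, and merge are all made-up names for illustration, not a real API):

from dataclasses import dataclass
from typing import List

@dataclass
class Thread:
    title: str
    messages: List[str]
    summary: str = ""   # filled in when the thread is summarized
    link: str = ""      # link back to the original thread

def summarize(thread: Thread) -> Thread:
    # Stand-in for an LLM call; here we just keep the first message as a "summary".
    thread.summary = thread.messages[0][:100] if thread.messages else ""
    return thread

def gist(threads: List[Thread]) -> str:
    # High-level recap across all thread summaries.
    topics = ", ".join(t.title for t in threads)
    return f"You've talked about {topics}. Which thread would you like to revisit?"

def merge(threads: List[Thread], title: str) -> Thread:
    # New thread seeded with the old summaries; the originals are kept untouched.
    seed = [f"{t.title}: {t.summary}" for t in threads]
    return Thread(title=title, messages=seed)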